In [None]:
import pandas as pd

In [None]:
# Load the games dataset from the CSV file that sits next to this notebook.
df = pd.read_csv(filepath_or_buffer="games1.csv")

In [None]:
# Preview the first three rows of the dataset.
df.head(n=3)

In [None]:
# List the column names available in the dataset.
df.columns

In [None]:
# Show the dataset dimensions as (rows, columns).
df.shape

In [None]:
# Show the first row whose name matches this title exactly
# (.iloc[0] raises IndexError if nothing matches).
# NOTE(review): "â„¢" looks like a mis-decoded "™" — confirm how the CSV stores
# the name before changing this literal.
df.loc[df["name"].eq("The Simsâ„¢ 4")].iloc[0]

# Describing Relationships with Correlations

## <font color='green'>**Is there a relationship between playtime and Metacritic score?**</font>

In [None]:
# Raw scatter of every game: Metacritic score (x) against median lifetime playtime (y).
df.plot(x="metacritic_score", y="median_playtime_forever", kind="scatter");

In [None]:
# Same scatter, restricted to games with a positive Metacritic score and a
# positive median playtime below 2000, so the bulk of the points is readable.
(
    df.loc[
        lambda games: games["metacritic_score"].gt(0)
        & games["median_playtime_forever"].gt(0)
        & games["median_playtime_forever"].lt(2000)
    ]
    .plot.scatter(
        x="metacritic_score",
        y="median_playtime_forever",
        title="Metacritic Score and Median Total Playtime",
    )
);

In [None]:
from scipy.stats import pearsonr

In [None]:
# Correlate the two variables this section asks about and plots:
# Metacritic score and median lifetime playtime (the original call compared
# Metacritic score with user_score instead).
# NOTE(review): rows with a 0 in either column are excluded because the rest of
# the notebook filters them out (they look like "no data" placeholders) —
# confirm against the dataset documentation.
scored_and_played = df[
    (df["metacritic_score"] > 0) & (df["median_playtime_forever"] > 0)
]
result = pearsonr(
    scored_and_played["metacritic_score"],
    scored_and_played["median_playtime_forever"],
)

In [None]:
# Report the correlation coefficient and its p-value, rounded to three decimals.
print(f"Pearson's r: {round(result.statistic, 3)!s}")
print(f"p-value: {round(result.pvalue, 3)!s}")

# Finding Differences with Hypothesis Tests

## <font color='green'>**Are multiplayer games rated more highly than single-player games?**</font>

In [None]:
# Work on an independent copy that keeps only games whose categories are known.
df_ttest = df[df["categories"].notna()].copy()

# Determine player group directly
def classify_player_group(categories):
    """Label a game by which play modes appear in ``categories``.

    Returns "single" when only "Single-player" is found, "multi" when only
    "Multi-player" is found, and "other" when both or neither are found.
    """
    # Key: (single-player present, multi-player present); anything else is "other".
    labels = {(True, False): "single", (False, True): "multi"}
    found = ("Single-player" in categories, "Multi-player" in categories)
    return labels.get(found, "other")

# Tag every game with its player group ...
df_ttest["player_group"] = df_ttest["categories"].map(classify_player_group)

# ... and keep just the two groups being compared.
df_ttest = df_ttest.loc[df_ttest["player_group"].isin(["single", "multi"])]

In [None]:
# Number of games in each player group.
df_ttest["player_group"].value_counts()

In [None]:
# Mean Metacritic score per player group, using only games with a positive score.
(
    df_ttest.loc[df_ttest["metacritic_score"] > 0]
    .groupby("player_group")["metacritic_score"]
    .mean()
)

In [None]:
from scipy.stats import ttest_ind

In [None]:
# Independent two-sample t-test on the Metacritic scores (> 0 only) of
# single-player vs. multi-player games. ttest_ind is called with its defaults,
# i.e. a two-sided test that assumes equal variances.
t_stat, p_value = ttest_ind(
    *(
        df_ttest.loc[
            (df_ttest["player_group"] == group) & (df_ttest["metacritic_score"] > 0),
            "metacritic_score",
        ]
        for group in ("single", "multi")
    )
)

print(f"p-value: {round(p_value, 3)!s}")

# Gaining Insights Using Interactive Visualizations

## <font color='green'>**What are the most popular games in each genre?**</font>

In [None]:
import altair as alt

In [None]:
# Reduce each game's comma-separated genre string to its first genre.
# Anything that is not a string (e.g. a missing value) becomes "other";
# isinstance() is used because an exact `type(x) != float` check would try to
# call .split() on any other non-string value.
df["genres"] = df["genres"].apply(
    lambda value: value.split(",")[0] if isinstance(value, str) else "other"
)

# Games with recent playtime and a Metacritic score — the same subset the
# chart below is built from.
genre_source = df[(df["median_playtime_two_weeks"] > 0) & (df["metacritic_score"] > 0)]

# One list of genres per game (vectorised replacement for the iterrows loop).
all_genres = genre_source["genres"].dropna().str.split(",").tolist()

all_genres_flat = [genre for genre_list in all_genres for genre in genre_list]

# Sorted list rather than a set: the dropdown needs an ordered sequence, and a
# set's iteration order is arbitrary.
unique_genre_options = sorted(set(all_genres_flat))

In [None]:
# Highlight the point nearest to the mouse cursor.
point_selector = alt.selection_point(on='mouseover', nearest=True)

# Dropdown that filters the chart by genre. The options are passed as a sorted
# list: a set has no defined order (so the menu order would be arbitrary) and
# is not a JSON array, which is what the chart specification stores.
input_dropdown = alt.binding_select(options=sorted(unique_genre_options), name='Genre')
genre_selector = alt.selection_point(fields=['genres'], bind=input_dropdown)

# Scatter of recent playtime vs. Metacritic score for games that have both;
# hovering colours the nearest point and shows the game's name as a tooltip.
chart = alt.Chart(df[(df.median_playtime_two_weeks > 0) & (df.metacritic_score > 0)]).mark_circle(size=100).encode(
    alt.X("median_playtime_two_weeks", type="quantitative"),
    alt.Y("metacritic_score", type="quantitative", scale=alt.Scale(domain=[50, 110])),
    tooltip=["name"],
    color = alt.condition(point_selector, alt.value("salmon"), alt.value('lightgray'))
).add_params(point_selector, genre_selector
).transform_filter(
    genre_selector
)


chart.interactive()