Step 1: Open the data file and study the general information

In [None]:
import pandas as pd

# Load the raw games dataset from the CSV file into a DataFrame.
file_path = '/datasets/games.csv'
df = pd.read_csv(file_path)

# Display general information.
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called bare; wrapping it in print() would emit an extra "None" line.
df.info()
print(df.describe())
print(df.head())


Step 2: Prepare the data
Replace the column names (make them lowercase)

In [None]:
# Normalise every column label to lowercase so later cells can refer to
# columns with a single, predictable spelling.
df = df.rename(columns=str.lower)


Convert the data to the required types

In [None]:
# Convert columns to appropriate types if necessary.
# 'year_of_release' is stored as whole numbers, so it is cast to pandas'
# nullable integer dtype ('Int64'). Unlike a plain int cast, this keeps
# missing years as <NA> instead of requiring a placeholder such as 0, which
# would otherwise show up as a fake release year in per-year statistics.
# Plain assignment is used instead of a chained `df[col].fillna(..., inplace=True)`
# call, which recent pandas versions warn about because it acts on a
# temporary object rather than on `df`.
# NOTE(review): assumes the column was parsed as numeric (float with NaN) — confirm.
df['year_of_release'] = df['year_of_release'].astype('Int64')

Deal with missing values

In [None]:
# Handling missing values in 'rating':
# a missing rating and the literal 'TBD' marker are both mapped to the single
# explicit category 'undefined'. The result is assigned back to the column
# rather than using a chained `df[col].method(..., inplace=True)` call, which
# recent pandas versions warn about because it modifies a temporary object.
# NOTE(review): only the exact string 'TBD' in 'rating' is replaced. If the
# placeholder is spelled differently or lives in another column (e.g. a score
# column), it is not touched here — confirm against the data.
df['rating'] = df['rating'].fillna('undefined').replace('TBD', 'undefined')

# Rows where BOTH 'name' and 'genre' are missing are dropped (not filled),
# but only when they make up less than 1% of the dataset.
threshold = len(df) * 0.01
missing_name_genre = df[df[['name', 'genre']].isna().all(axis=1)]
if len(missing_name_genre) < threshold:
    df.dropna(subset=['name', 'genre'], how='all', inplace=True)

# Reason for missing values: could be due to incomplete data entry, lack of availability, or TBD cases.


Calculate total sales

In [None]:
# Add a 'total_sales' column: the row-wise sum of the four regional sales columns.
regional_sales_columns = ['na_sales', 'eu_sales', 'jp_sales', 'other_sales']
df['total_sales'] = df[regional_sales_columns].sum(axis='columns')


Step 3: Analyze the data

Number of games released in different years

In [None]:
# Count how many rows (games) fall in each release year,
# then order the counts chronologically by year.
release_year_counts = df['year_of_release'].value_counts()
games_per_year = release_year_counts.sort_index()
print(games_per_year)


Sales variation from platform to platform

In [None]:
# Total sales per platform over the whole dataset, largest first.
sales_by_platform = df.groupby('platform')['total_sales'].sum()
platform_sales = sales_by_platform.sort_values(ascending=False)
print(platform_sales)


Determine relevant period for analysis (e.g., the last 10 years)

In [None]:
# Restrict the data to the analysis period: releases from 2006 onwards.
# The cutoff year is hard-coded here.
released_since_2006 = df['year_of_release'].ge(2006)
recent_years = df.loc[released_since_2006]


Leading platforms in sales

In [None]:
# Five platforms with the highest total sales within the analysis period.
recent_platform_totals = recent_years.groupby('platform')['total_sales'].sum()
top_platforms = recent_platform_totals.sort_values(ascending=False).head(5)
print(top_platforms)


Box plot for global sales by platform

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Box plot of per-game total sales, one box per platform, for the analysis period.
# An explicit figure/axes pair is created and handed to seaborn.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=recent_years, x='platform', y='total_sales', ax=ax)
ax.set_title('Global Sales by Platform')
plt.show()


Impact of user and professional reviews on sales (scatter plots)

In [None]:
# Correlation between reviews and sales for a popular platform (e.g., PS4)
# Work on a copy so the score columns can be cleaned without touching `recent_years`.
popular_platform = recent_years[recent_years['platform'] == 'PS4'].copy()

# Scores must be numeric for plotting on a continuous axis and for corr().
# Coercion turns any non-numeric placeholder text into NaN and is a no-op
# for columns that are already numeric.
# NOTE(review): presumably placeholder strings can occur in the score columns — confirm.
for score_col in ['user_score', 'critic_score']:
    popular_platform[score_col] = pd.to_numeric(popular_platform[score_col], errors='coerce')

# One panel per score type: the two scores are different x variables, so
# drawing both on the same axes would mix their scales under a single x label.
fig, (user_ax, critic_ax) = plt.subplots(1, 2, figsize=(12, 8), sharey=True)
sns.scatterplot(x='user_score', y='total_sales', data=popular_platform, ax=user_ax)
user_ax.set_title('User Score vs Sales')
sns.scatterplot(x='critic_score', y='total_sales', data=popular_platform, ax=critic_ax)
critic_ax.set_title('Critic Score vs Sales')
fig.suptitle('User and Professional Reviews vs Sales (PS4)')
plt.show()

# Calculate correlation (Series.corr ignores rows where either value is NaN)
user_corr = popular_platform['user_score'].corr(popular_platform['total_sales'])
critic_corr = popular_platform['critic_score'].corr(popular_platform['total_sales'])
print(f"User Score Correlation: {user_corr}")
print(f"Critic Score Correlation: {critic_corr}")


General distribution of games by genre

In [None]:
# Total sales per genre within the analysis period, largest first.
# Note: this aggregates sales, not the number of games in each genre.
recent_genre_totals = recent_years.groupby('genre')['total_sales'].sum()
genre_sales = recent_genre_totals.sort_values(ascending=False)
print(genre_sales)


Step 4: Create a user profile for each region

Top platforms and genres in each region

In [None]:
# Regional user profiles for NA, EU and JP: the five best-selling platforms
# and genres per region, followed by sales broken down by ESRB rating.
# NOTE(review): these aggregations read the full `df`, not the `recent_years`
# subset used by the earlier analysis cells, and the first loop rebinds the
# notebook-level name `top_platforms` — confirm both are intended.
regions = ['na', 'eu', 'jp']

for region in regions:
    sales_col = f'{region}_sales'

    platform_totals = df.groupby('platform')[sales_col].sum()
    top_platforms = platform_totals.sort_values(ascending=False).head(5)

    genre_totals = df.groupby('genre')[sales_col].sum()
    top_genres = genre_totals.sort_values(ascending=False).head(5)

    print(f"Top platforms in {region.upper()}:\n{top_platforms}\n")
    print(f"Top genres in {region.upper()}:\n{top_genres}\n")

# ESRB rating impact on sales in each region
for region in regions:
    sales_col = f'{region}_sales'
    rating_totals = df.groupby('rating')[sales_col].sum()
    rating_sales = rating_totals.sort_values(ascending=False)
    print(f"ESRB Rating impact in {region.upper()}:\n{rating_sales}\n")


Step 5: Test hypotheses

Hypothesis testing