# Video Game Exploratory Data Analysis

In [None]:
# Imports:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Load the games dataset from its CSV file into the working DataFrame:
df = pd.read_csv(filepath_or_buffer='Video Game Data Proj/Resources/games_dataset.csv')

In [None]:
# Preview the first 10 rows of the DataFrame (shown as the cell output):
df.head(n=10)

In [None]:
# Print a heading, then the DataFrame's summary report produced by df.info():
print("\nDataFrame Info:")
df.info()

In [None]:
# Count the missing (null) entries in every column:
missing_values = df.isna().sum()

# Report whether the dataset contains any missing values at all:
if missing_values.sum() != 0:
    print("\nThere are missing values in the dataset. Consider handling them appropriately.")
else:
    print("\nGreat! There are no missing values in the dataset.")

# Statistics of the data:

In [None]:
# Select the numeric columns:
numeric_columns = df.select_dtypes(include=[np.number]).columns

# Calculate summary statistics for every numeric column (one row per statistic):
stats = df[numeric_columns].agg(['min', 'max', 'mean', 'median', 'std'])

# Give the statistic rows readable labels. An explicit mapping is used instead of
# positional assignment so each label stays tied to the statistic it describes:
stats = stats.rename(index={
    'min': 'Minimum',
    'max': 'Maximum',
    'mean': 'Mean',
    'median': 'Median',
    'std': 'Standard Deviation',
})

# Show the table as the cell output (previously it was computed but never displayed):
stats

In [None]:
# Summary statistics for the 'User Rating' column (skipped when the column is absent):
if 'User Rating' in df.columns:
    user_rating = df['User Rating']
    print("\nStatistics for User Rating:")
    # Each statistic is computed right before it is printed, in this order.
    for label, method_name in (
        ("Minimum", "min"),
        ("Maximum", "max"),
        ("Mean", "mean"),
        ("Median", "median"),
        ("Standard Deviation", "std"),
    ):
        print(f"{label}: {getattr(user_rating, method_name)()}")

In [None]:
# Top 5 genres by number of games:
genre_counts = df['Genre'].value_counts()
top_5_genres = genre_counts.nlargest(n=5)

print('The top 5 genres are:')
for genre_name in top_5_genres.index:
    print(f'{genre_name}: {top_5_genres.loc[genre_name]}')

In [None]:
# Share of all rows held by each genre, expressed as a percentage:
genre_percentages = genre_counts.div(len(df)).mul(100)
print('Percentage of games in each genre:')
for genre_name, share in genre_percentages.items():
    print(f'{genre_name}: {share:.2f}%')

In [None]:
# Number of games recorded for each platform:
platform_counts = df.loc[:, 'Platform'].value_counts()
print(platform_counts)

In [None]:
# Share of all rows held by each platform, expressed as a percentage:
platform_percentages = platform_counts.div(len(df)).mul(100)
print('Percentage of games in each platform:')
for platform_name, share in platform_percentages.items():
    print(f'{platform_name}: {share:.2f}%')

In [None]:
# Report the oldest and the most recent release year, then describe the column:
release_years = df['Release Year']
print('Oldest and most recent year:')
print(release_years.min())
print(release_years.max())
print('Description of data:')
print(release_years.describe())

# Graphic Representation of Video Game Sales

 Distribution of the number of games released per year:

In [None]:
# Distribution of year releases by years; 2000-2005, 2006-2010, 2011-2015, 2016-2020, 2021-2023:
def release_year_distribution(df, bins=None, labels=None):
    """Plot and return the number of games released in each year range.

    Args:
        df: DataFrame with a numeric 'Release Year' column. It is not modified.
        bins: Ascending bin edges passed to ``pd.cut``. Defaults to
            ``[2000, 2005, 2010, 2015, 2020, 2023]``.
        labels: One label per bin. Defaults to
            '2000-2005', '2006-2010', '2011-2015', '2016-2020', '2021-2023'.

    Returns:
        Series of game counts indexed by year-range label, in bin order.
        Years outside the outermost bin edges fall into no bin and are
        therefore not counted.
    """
    if bins is None:
        bins = [2000, 2005, 2010, 2015, 2020, 2023]
    if labels is None:
        labels = ['2000-2005', '2006-2010', '2011-2015', '2016-2020', '2021-2023']

    # include_lowest=True closes the first interval on the left; without it the
    # lowest edge (2000 by default) is excluded even though the first label
    # advertises it. The binned years are kept in a local Series instead of
    # being written into the caller's DataFrame, which avoids mutating the
    # input (and the SettingWithCopyWarning that triggers on a filtered frame).
    year_range = pd.cut(
        df['Release Year'], bins=bins, labels=labels, include_lowest=True
    ).rename('year_range')

    # sort_index orders the counts by bin rather than by frequency.
    year_counts = year_range.value_counts().sort_index()

    # Create a bar plot:
    plt.figure(figsize=(10, 6))
    plt.bar(year_counts.index, year_counts.values)
    plt.xlabel('Year Range')
    plt.ylabel('Number of Games')
    plt.title('Distribution of Year Releases by Years')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
    return year_counts

# Coerce 'Release Year' to numbers; entries that cannot be parsed become NaN:
df['Release Year'] = pd.to_numeric(df['Release Year'], errors='coerce')

# Keep only the rows that still have a release year:
df = df[df['Release Year'].notna()]

# Plot the distribution; the returned counts are the last expression of the cell:
release_year_distribution(df)

Genre distribution of the data:

In [None]:
def genre_horizontal_bar_chart(df):
    """Draw a horizontal bar chart of games per genre and print the top 5.

    Args:
        df: DataFrame with a 'Genre' column.

    Returns:
        None. The chart is shown with matplotlib and the five most common
        genres are printed with their count and percentage.
    """
    # Count once and derive the percentages from the same counts.
    genre_counts = df['Genre'].value_counts()
    genre_percentages = genre_counts / genre_counts.sum() * 100

    # Sort ascending so the largest genre ends up at the top of the barh chart.
    genre_dist = pd.DataFrame({'Count': genre_counts, 'Percentage': genre_percentages})
    genre_dist = genre_dist.sort_values('Count', ascending=True)

    # Create the plot
    plt.figure(figsize=(12, 10))
    bars = plt.barh(genre_dist.index, genre_dist['Count'],
                    color=sns.color_palette("husl", len(genre_dist)))

    # Customize the plot
    plt.title('Genre Distribution', fontsize=16)
    plt.xlabel('Number of Games', fontsize=12)
    plt.ylabel('Genre', fontsize=12)

    # Remove top and right spines
    axes = plt.gca()
    axes.spines['top'].set_visible(False)
    axes.spines['right'].set_visible(False)

    # Label the end of each bar with "count (percentage%)".
    for bar, count, percentage in zip(bars, genre_dist['Count'], genre_dist['Percentage']):
        plt.text(bar.get_width(), bar.get_y() + bar.get_height() / 2,
                 f'{count} ({percentage:.1f}%)',
                 ha='left', va='center', fontweight='bold')

    # Adjust layout and display the plot
    plt.tight_layout()
    plt.show()

    # Print top 5 genres. The columns are iterated directly rather than with
    # iterrows(): iterrows() upcasts each mixed int/float row to float, which
    # made the counts print as e.g. "120.0".
    print("Top 5 Genres:")
    top_five = genre_dist.sort_values('Count', ascending=False).head()
    for genre, count, percentage in zip(top_five.index, top_five['Count'], top_five['Percentage']):
        print(f"{genre}: {count} ({percentage:.2f}%)")

# Draw the genre distribution chart for the notebook's DataFrame `df`:
genre_horizontal_bar_chart(df)

Heatmap of average user ratings by genre and release year:

In [None]:
# Heatmap of average user ratings by genre and release year:
def genre_ratings_heatmap(df):
    """Draw a heatmap of the mean user rating per genre and release year.

    Args:
        df: DataFrame with 'Genre', 'Release Year' and 'User Rating' columns.
            It is not modified.

    Returns:
        None. The heatmap is shown with matplotlib, then the five genres and
        the five years with the highest ratings are printed.
    """
    # Work on a private copy of the needed columns. Converting 'Release Year'
    # to strings directly on `df` would change the caller's column dtype and
    # silently affect every later analysis that expects numeric years.
    ratings = df[['Genre', 'Release Year', 'User Rating']].copy()

    # Treat the year as a categorical label for the heatmap columns.
    ratings['Release Year'] = ratings['Release Year'].astype(str)

    # Average rating for every (genre, year) pair.
    genre_ratings = ratings.groupby(['Genre', 'Release Year'])['User Rating'].mean().reset_index()

    # Pivot into a genre x year matrix suitable for a heatmap.
    heatmap_data = genre_ratings.pivot(index='Genre', columns='Release Year', values='User Rating')

    # Create the plot
    plt.figure(figsize=(16, 12))
    sns.heatmap(heatmap_data, annot=True, cmap='YlGnBu', fmt='.2f', cbar_kws={'label': 'Average Rating'})

    # Set the title and axis labels
    plt.title('Average Ratings by Genre and Year', fontsize=16)
    plt.xlabel('Year', fontsize=12)
    plt.ylabel('Genre', fontsize=12)

    # Rotate x-axis labels for better readability
    plt.xticks(rotation=45, ha='right')

    # Adjust layout to prevent cutoff of labels
    plt.tight_layout()
    plt.show()

    # The summaries below average the per-(genre, year) means, so every
    # genre/year cell carries equal weight regardless of how many games it holds.
    print("Genres with highest average ratings:")
    top_genres = genre_ratings.groupby('Genre')['User Rating'].mean().sort_values(ascending=False).head()
    print(top_genres)

    print("\nYears with highest average ratings:")
    top_years = genre_ratings.groupby('Release Year')['User Rating'].mean().sort_values(ascending=False).head()
    print(top_years)

# Draw the genre/year rating heatmap for the notebook's DataFrame `df`:
genre_ratings_heatmap(df)

Pie chart of platforms:

In [None]:
# Exploded pie chart of the platform shares:
def platform_games_analysis(df):
    """Draw an exploded pie chart of games per platform and print the shares.

    Args:
        df: DataFrame with a 'Platform' column.

    Returns:
        None. The chart is shown with matplotlib and each platform's game
        count and share of the total are printed.
    """
    # Calculate the platform counts
    platform_counts = df['Platform'].value_counts()
    n_platforms = len(platform_counts)

    # Create a new figure with a larger size
    plt.figure(figsize=(14, 10))

    # Sample one RGBA colour per platform from the Dark2 colormap.
    colors = plt.cm.Dark2(np.arange(n_platforms) / n_platforms)

    # Darken only the RGB channels. Scaling the whole RGBA row would also
    # lower alpha to 0.8 and make the wedges translucent instead of darker.
    colors[:, :3] *= 0.8

    # Create the pie chart; every wedge is pulled out slightly (explode).
    wedges, texts, autotexts = plt.pie(platform_counts.values,
                                       labels=platform_counts.index,
                                       autopct='%1.1f%%',
                                       startangle=90,
                                       explode=[0.05] * n_platforms,
                                       colors=colors,
                                       shadow=False)  # flat look, no drop shadow

    # Enhance the appearance of text - make labels and percentages bold
    plt.setp(autotexts, size=10, weight="bold", color="Black")
    plt.setp(texts, size=12, weight="bold")

    # Add a title
    plt.title('Platform Market Share', fontsize=20, fontweight='bold')

    # Equal aspect ratio ensures that pie is drawn as a circle
    plt.axis('equal')

    # Add a legend with bold title and entries
    legend = plt.legend(title='Platforms', loc='center left', bbox_to_anchor=(1, 0, 0.5, 1))
    plt.setp(legend.get_title(), fontweight='bold')
    for text in legend.get_texts():
        text.set_fontweight('bold')

    # Adjust layout and display the plot
    plt.tight_layout()
    plt.show()

    # Print additional information; the total is computed once, not per row.
    total_games = platform_counts.sum()
    print("Total number of games by platform:")
    for platform, count in platform_counts.items():
        print(f"{platform}: {count} games ({count / total_games:.1%})")

# Draw the platform pie chart for the notebook's DataFrame `df`:
platform_games_analysis(df)

Stacked bar chart of genres by platform:

In [None]:
def create_stacked_chart(df):
    """Plot a stacked bar chart of genre counts per platform and print a summary.

    Args:
        df: DataFrame with 'Platform' and 'Genre' columns.

    Returns:
        None. The chart is shown with matplotlib; the per-platform totals and
        each platform's three largest genres are printed.
    """
    # Rows are platforms, columns are genres, cells are game counts.
    counts_by_platform = pd.crosstab(df['Platform'], df['Genre'])

    # One bar per platform, one stacked segment per genre.
    ax = counts_by_platform.plot(kind='bar', stacked=True, figsize=(15, 10))

    # Title and axis labels share the same bold styling.
    bold = {'fontweight': 'bold'}
    ax.set_title('Distribution of Games by Platform and Genre', fontsize=20, **bold)
    ax.set_xlabel('Platform', fontsize=14, **bold)
    ax.set_ylabel('Number of Games', fontsize=14, **bold)

    # Slant the platform names so long labels do not overlap.
    plt.xticks(rotation=45, ha='right', **bold)
    plt.yticks(**bold)

    # Place the genre legend outside the plotting area.
    legend = ax.legend(title='Genre', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
    plt.setp(legend.get_title(), fontsize='14', fontweight='bold')

    # Write each segment's count at its centre.
    for segment_group in ax.containers:
        ax.bar_label(segment_group, label_type='center', fontsize=8)

    plt.tight_layout()
    plt.show()

    # Summary: total games per platform, largest first.
    print("Total number of games by platform:")
    totals = counts_by_platform.sum(axis=1).sort_values(ascending=False)
    for name, total in totals.items():
        print(f"{name}: {total} games")

    # Summary: the three most common genres on every platform.
    print("\nTop 3 genres for each platform:")
    for name, genre_row in counts_by_platform.iterrows():
        print(f"\n{name}:")
        for genre, n_games in genre_row.nlargest(3).items():
            print(f"  {genre}: {n_games} games")

# Draw the platform/genre stacked bar chart for the notebook's DataFrame `df`:
create_stacked_chart(df)

Stacked bar chart of the top 3 genres by release year:

In [None]:
# Stacked bar chart top 3 genre by years:
def create_top_genres_by_year_chart(df):
    """Plot, per release year, a stacked bar of that year's three largest genres.

    Args:
        df: DataFrame with 'Release Year' and 'Genre' columns.

    Returns:
        None. The chart is shown with matplotlib and the top genres of each
        year are printed with their game counts.
    """
    # Year x genre matrix of game counts (0 where a genre has no games).
    year_genre_counts = df.groupby(['Release Year', 'Genre']).size().unstack(fill_value=0)

    # For each year keep only its three largest genres; the other cells come
    # back as NaN from apply().
    top_3_genres = year_genre_counts.apply(lambda row: row.nlargest(3), axis=1)

    # Replace NaN with 0 and restore the integer dtype: the NaN round-trip
    # turned the counts into floats, which printed as "12.0 games".
    top_3_genres = top_3_genres.fillna(0).astype(int)

    # Sort years in ascending order
    top_3_genres = top_3_genres.sort_index()

    # Create a stacked bar chart
    ax = top_3_genres.plot(kind='bar', stacked=True, figsize=(15, 10))

    # Customize the chart
    plt.title('Top 3 Genres by Year', fontsize=20, fontweight='bold')
    plt.xlabel('Release Year', fontsize=14, fontweight='bold')
    plt.ylabel('Number of Games', fontsize=14, fontweight='bold')

    # Rotate x-axis labels for better readability
    plt.xticks(rotation=45, ha='right', fontweight='bold')
    plt.yticks(fontweight='bold')

    # Add a legend
    plt.legend(title='Genre', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
    plt.setp(ax.get_legend().get_title(), fontsize='14', fontweight='bold')

    # Label only the segments that exist; genres outside a year's top 3 have
    # height 0 and would otherwise clutter the chart with "0" labels.
    for container in ax.containers:
        segment_labels = [str(int(value)) if value > 0 else '' for value in container.datavalues]
        ax.bar_label(container, labels=segment_labels, label_type='center', fontsize=8)

    # Adjust layout and display the plot
    plt.tight_layout()
    plt.show()

    # Print summary statistics
    print("Top 3 genres for each year:")
    for year in top_3_genres.index:
        top_genres = top_3_genres.loc[year].nlargest(3)
        print(f"\n{year}:")
        for genre, count in top_genres.items():
            if count > 0:  # Only print if there are games in this genre
                print(f"  {genre}: {count} games")

# Draw the top-3-genres-per-year chart for the notebook's DataFrame `df`:
create_top_genres_by_year_chart(df)


Line chart of the top 5 genres' average user rating over time:

In [None]:
def top_5_genres_performance_over_time(df):
    """Plot the yearly mean user rating of the five best-rated genres.

    Args:
        df: DataFrame with 'Genre', 'Release Year' and 'User Rating' columns.

    Returns:
        None. A line chart is shown with matplotlib; the overall averages of
        the top genres and their rating change between the first and the last
        release year are printed.
    """
    # Overall mean rating per genre, best first.
    overall_avg = df.groupby('Genre')['User Rating'].mean().sort_values(ascending=False)

    # Select top 5 genres
    top_5_genres = overall_avg.head(5).index.tolist()

    # Filter the dataframe for top 5 genres
    df_top_5 = df[df['Genre'].isin(top_5_genres)]

    # Year x genre matrix of mean ratings; NaN where a genre has no games that year.
    genre_performance = df_top_5.groupby(['Release Year', 'Genre'])['User Rating'].mean().unstack()

    # Create the line plot
    plt.figure(figsize=(16, 10))

    # One colour per genre line.
    colors = sns.color_palette("husl", n_colors=5)
    for color, genre in zip(colors, top_5_genres):
        plt.plot(genre_performance.index, genre_performance[genre], marker='o',
                 label=genre, linewidth=2, color=color)

    # Customize the chart
    plt.title('Top 5 Genres Performance Over Time', fontsize=20, fontweight='bold')
    plt.xlabel('Release Year', fontsize=14, fontweight='bold')
    plt.ylabel('Average User Rating', fontsize=14, fontweight='bold')

    plt.legend(title='Genre', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=12)
    plt.grid(True, linestyle='--', alpha=0.7)

    # The y-axis starts at 3, not 0.
    # NOTE(review): presumably chosen to zoom in on the rating range - confirm
    # that no yearly average falls below 3, otherwise it is clipped.
    plt.ylim(bottom=3)

    plt.tight_layout()
    plt.show()

    # Print summary statistics
    print("Top 5 genres by overall average rating:")
    for genre, avg_rating in overall_avg.head().items():
        print(f"{genre}: {avg_rating:.2f}")

    print("\nPerformance change for top 5 genres (comparing first and last year):")
    first_year = genre_performance.index.min()
    last_year = genre_performance.index.max()
    improvement = genre_performance.loc[last_year] - genre_performance.loc[first_year]
    for genre in top_5_genres:
        change = improvement[genre]
        if pd.isna(change):
            # The genre has no rated games in one of the two boundary years,
            # so there is no meaningful difference to report.
            print(f"{genre}: n/a (no ratings in {first_year} and/or {last_year})")
        else:
            print(f"{genre}: {change:.2f} change in rating")

# Draw the top-5-genre rating trend chart for the notebook's DataFrame `df`:
top_5_genres_performance_over_time(df)