# Lab 3: Complete Pandas Data Analysis
# Video Game Sales Analysis

## 📚 **Learning Objectives**
By the end of this lab, you will be able to:
- Create and manipulate pandas Series and DataFrames
- Load, clean, and preprocess real-world datasets
- Perform data selection, filtering, and transformation
- Conduct statistical analysis and aggregation operations
- Create meaningful visualizations to uncover insights
- Apply advanced pandas techniques for data analysis

## 📊 **Dataset Overview**
We'll analyze a comprehensive video game sales dataset containing information about:
- Game titles, platforms, and release years
- Sales figures across different regions (NA, EU, JP, Other)
- Game genres and publishers
- Global sales performance

---

## 1️⃣ Import Required Libraries

## 2️⃣ Load and Explore Dataset

In [2]:
# Load the video game sales dataset
print("üéÆ Loading Video Game Sales Dataset...")
game_df = pd.read_csv('vgsales.csv')

print("‚úÖ Dataset loaded successfully!")
print(f"üìè Dataset Shape: {game_df.shape}")
print(f"üìÖ Data Range: {game_df['Year'].min():.0f} - {game_df['Year'].max():.0f}")

# Display basic information about the dataset
print("\n" + "="*60)
print("üìä DATASET OVERVIEW")
print("="*60)

print("\nüè∑Ô∏è Column Names:")
print(game_df.columns.tolist())

print("\nüìã Data Types:")
print(game_df.dtypes)

print("\nüîç First 5 Rows:")
print(game_df.head())

print("\nüìà Dataset Info:")
game_df.info()

üéÆ Loading Video Game Sales Dataset...
‚úÖ Dataset loaded successfully!
üìè Dataset Shape: (16598, 11)
üìÖ Data Range: 1980 - 2020

üìä DATASET OVERVIEW

üè∑Ô∏è Column Names:
['Rank', 'Name', 'Platform', 'Year', 'Genre', 'Publisher', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']

üìã Data Types:
Rank              int64
Name             object
Platform         object
Year            float64
Genre            object
Publisher        object
NA_Sales        float64
EU_Sales        float64
JP_Sales        float64
Other_Sales     float64
Global_Sales    float64
dtype: object

üîç First 5 Rows:
   Rank                      Name Platform    Year         Genre Publisher  NA_Sales  EU_Sales  JP_Sales  Other_Sales  Global_Sales
0     1                Wii Sports      Wii  2006.0        Sports  Nintendo     41.49     29.02      3.77         8.46         82.74
1     2         Super Mario Bros.      NES  1985.0      Platform  Nintendo     29.08      3.58      6.81         

## 3️⃣ Data Cleaning and Preprocessing

In [3]:
# Data Cleaning and Preprocessing
print("üßπ DATA CLEANING AND PREPROCESSING")
print("="*50)

# 1. Check for missing values
print("üîç 1. Missing Values Analysis:")
missing_values = game_df.isnull().sum()
print(missing_values)
print(f"\nTotal missing values: {missing_values.sum()}")

# Show only columns with missing values
missing_cols = missing_values[missing_values > 0]
if len(missing_cols) > 0:
    print("\nüìä Columns with missing values:")
    for col, count in missing_cols.items():
        percentage = (count / len(game_df)) * 100
        print(f"  {col}: {count} ({percentage:.2f}%)")

# 2. Handle missing values
print(f"\nüîß 2. Data Cleaning:")
print(f"Original shape: {game_df.shape}")

# Remove rows where Year is missing (most critical for analysis)
game_df_clean = game_df.dropna(subset=['Year']).copy()
print(f"After removing missing Years: {game_df_clean.shape}")
print(f"Rows removed: {game_df.shape[0] - game_df_clean.shape[0]}")

# 3. Check for duplicates
print(f"\nüîÑ 3. Duplicate Analysis:")
duplicates = game_df_clean.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

# 4. Data type optimization
print(f"\n‚ö° 4. Data Type Optimization:")
print("Before optimization:")
print(game_df_clean.dtypes)

# Convert Year to integer (it might be float due to missing values)
game_df_clean['Year'] = game_df_clean['Year'].astype(int)

print(f"\n‚úÖ Data cleaning completed!")
print(f"Final dataset shape: {game_df_clean.shape}")

# Display sample of cleaned data
print(f"\nüìã Sample of cleaned data:")
print(game_df_clean.head())

üßπ DATA CLEANING AND PREPROCESSING
üîç 1. Missing Values Analysis:
Rank              0
Name              0
Platform          0
Year            271
Genre             0
Publisher        58
NA_Sales          0
EU_Sales          0
JP_Sales          0
Other_Sales       0
Global_Sales      0
dtype: int64

Total missing values: 329

üìä Columns with missing values:
  Year: 271 (1.63%)
  Publisher: 58 (0.35%)

üîß 2. Data Cleaning:
Original shape: (16598, 11)
After removing missing Years: (16327, 11)
Rows removed: 271

üîÑ 3. Duplicate Analysis:
Number of duplicate rows: 0

‚ö° 4. Data Type Optimization:
Before optimization:
Rank              int64
Name             object
Platform         object
Year            float64
Genre            object
Publisher        object
NA_Sales        float64
EU_Sales        float64
JP_Sales        float64
Other_Sales     float64
Global_Sales    float64
dtype: object

‚úÖ Data cleaning completed!
Final dataset shape: (16327, 11)

üìã Sample of cleaned data

## 4️⃣ Basic Pandas Operations
Demonstrating fundamental Series and DataFrame operations

In [None]:
# Basic Pandas Operations
print("üîß BASIC PANDAS OPERATIONS")
print("="*50)

# 1. Working with Series
print("üìä 1. Series Operations:")
# Create a Series from a column
sales_series = game_df_clean['Global_Sales']
print(f"Series type: {type(sales_series)}")
print(f"Series shape: {sales_series.shape}")
print(f"Series dtype: {sales_series.dtype}")

# Basic Series operations
print(f"\nSeries Statistics:")
print(f"  Count: {sales_series.count()}")
print(f"  Mean: {sales_series.mean():.2f}")
print(f"  Median: {sales_series.median():.2f}")
print(f"  Std: {sales_series.std():.2f}")
print(f"  Min: {sales_series.min():.2f}")
print(f"  Max: {sales_series.max():.2f}")

# 2. DataFrame Info
print(f"\nüóÇÔ∏è  2. DataFrame Information:")
print(f"Shape: {game_df_clean.shape}")
print(f"Columns: {list(game_df_clean.columns)}")
print(f"Index type: {type(game_df_clean.index)}")

# 3. Column operations
print(f"\nüìã 3. Column Operations:")
# Adding new column
game_df_clean['Sales_Category'] = pd.cut(game_df_clean['Global_Sales'], 
                                         bins=[0, 1, 5, 20, 100], 
                                         labels=['Low', 'Medium', 'High', 'Very High'])
print("Added 'Sales_Category' column based on Global_Sales")

# Dropping columns (create a copy first)
df_temp = game_df_clean.copy()
df_dropped = df_temp.drop(['Rank'], axis=1)
print(f"Dropped 'Rank' column. New shape: {df_dropped.shape}")

# 4. Row operations
print(f"\nüìù 4. Row Operations:")
# Sampling rows
sample_df = game_df_clean.sample(n=5, random_state=42)
print("Random sample of 5 rows:")
print(sample_df[['Name', 'Platform', 'Global_Sales']].to_string())

# 5. Index operations
print(f"\nüîó 5. Index Operations:")
# Set Name as index
df_indexed = game_df_clean.set_index('Name')
print(f"Set 'Name' as index. Shape: {df_indexed.shape}")

# Reset index
df_reset = df_indexed.reset_index()
print(f"Reset index. Shape: {df_reset.shape}")

print(f"\n‚úÖ Basic operations completed!")

## 5️⃣ Data Selection and Filtering
Advanced data selection, filtering, and conditional operations

In [None]:
# Data Selection and Filtering
print("üéØ DATA SELECTION AND FILTERING")
print("="*50)

# 1. Column Selection
print("üìã 1. Column Selection:")
# Single column
single_col = game_df_clean['Name']
print(f"Single column type: {type(single_col)}")

# Multiple columns
multi_cols = game_df_clean[['Name', 'Platform', 'Global_Sales']]
print(f"Multiple columns shape: {multi_cols.shape}")

# 2. Row Selection with loc and iloc
print(f"\nüîç 2. Row Selection:")
# Using iloc (integer position)
first_5 = game_df_clean.iloc[:5]
print(f"First 5 rows using iloc: {first_5.shape}")

# Using loc (label-based)
sample_rows = game_df_clean.loc[10:15, ['Name', 'Genre', 'Global_Sales']]
print("Sample rows using loc:")
print(sample_rows.to_string())

# 3. Boolean Indexing
print(f"\n‚úÖ 3. Boolean Indexing:")
# High sales games (>10 million)
high_sales = game_df_clean[game_df_clean['Global_Sales'] > 10]
print(f"Games with >10M sales: {len(high_sales)}")

# Multiple conditions
nintendo_high_sales = game_df_clean[
    (game_df_clean['Publisher'] == 'Nintendo') & 
    (game_df_clean['Global_Sales'] > 5)
]
print(f"Nintendo games with >5M sales: {len(nintendo_high_sales)}")

# 4. String filtering
print(f"\nüî§ 4. String Filtering:")
# Games with "Super" in the name
super_games = game_df_clean[game_df_clean['Name'].str.contains('Super', na=False)]
print(f"Games with 'Super' in name: {len(super_games)}")

# 5. Query method
print(f"\nüîç 5. Query Method:")
# Using query for complex conditions
recent_popular = game_df_clean.query('Year >= 2010 and Global_Sales > 2')
print(f"Popular games since 2010: {len(recent_popular)}")

# 6. isin method
print(f"\nüìù 6. isin Method:")
# Filter by multiple genres
action_sports = game_df_clean[game_df_clean['Genre'].isin(['Action', 'Sports'])]
print(f"Action and Sports games: {len(action_sports)}")

# 7. Top N selection
print(f"\nüèÜ 7. Top N Selection:")
# Top 10 best-selling games
top_10 = game_df_clean.nlargest(10, 'Global_Sales')
print("Top 10 Best-Selling Games:")
print(top_10[['Name', 'Platform', 'Global_Sales']].to_string())

# Bottom 5 worst-selling games (with sales > 0)
bottom_5 = game_df_clean[game_df_clean['Global_Sales'] > 0].nsmallest(5, 'Global_Sales')
print(f"\nBottom 5 games (sales > 0):")
print(bottom_5[['Name', 'Platform', 'Global_Sales']].to_string())

print(f"\n‚úÖ Selection and filtering completed!")

## 6️⃣ Statistical Analysis
Comprehensive statistical analysis and descriptive statistics

In [None]:
# Statistical Analysis
print("üìä STATISTICAL ANALYSIS")
print("="*50)

# 1. Descriptive Statistics
print("üìà 1. Descriptive Statistics:")
numeric_cols = ['Global_Sales', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
desc_stats = game_df_clean[numeric_cols].describe()
print(desc_stats)

# 2. Individual Statistical Measures
print(f"\nüî¢ 2. Individual Statistical Measures:")
global_sales = game_df_clean['Global_Sales']

print(f"Count: {global_sales.count()}")
print(f"Mean: {global_sales.mean():.3f}")
print(f"Median: {global_sales.median():.3f}")
print(f"Mode: {global_sales.mode().iloc[0]:.3f}")
print(f"Standard Deviation: {global_sales.std():.3f}")
print(f"Variance: {global_sales.var():.3f}")
print(f"Skewness: {global_sales.skew():.3f}")
print(f"Kurtosis: {global_sales.kurtosis():.3f}")

# Quartiles and IQR
q1 = global_sales.quantile(0.25)
q3 = global_sales.quantile(0.75)
iqr = q3 - q1
print(f"Q1 (25th percentile): {q1:.3f}")
print(f"Q3 (75th percentile): {q3:.3f}")
print(f"IQR: {iqr:.3f}")

# 3. Correlation Analysis
print(f"\nüîó 3. Correlation Analysis:")
correlation_matrix = game_df_clean[numeric_cols].corr()
print("Correlation Matrix:")
print(correlation_matrix.round(3))

# Find highest correlations
print(f"\nHighest correlations with Global_Sales:")
correlations = correlation_matrix['Global_Sales'].abs().sort_values(ascending=False)
for col, corr in correlations.items():
    if col != 'Global_Sales':
        print(f"  {col}: {corr:.3f}")

# 4. Value Counts for Categorical Data
print(f"\nüìä 4. Categorical Data Analysis:")
print("Top 10 Genres by count:")
print(game_df_clean['Genre'].value_counts().head(10))

print(f"\nTop 10 Publishers by count:")
print(game_df_clean['Publisher'].value_counts().head(10))

print(f"\nTop 10 Platforms by count:")
print(game_df_clean['Platform'].value_counts().head(10))

# 5. Percentiles
print(f"\nüìè 5. Percentiles Analysis:")
percentiles = [10, 25, 50, 75, 90, 95, 99]
print("Global Sales Percentiles:")
for p in percentiles:
    value = global_sales.quantile(p/100)
    print(f"  {p}th percentile: {value:.3f}")

# 6. Outlier Detection using IQR
print(f"\nüö® 6. Outlier Detection:")
# Tukey fences: anything more than 1.5 * IQR beyond the quartiles is an outlier.
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
outliers = global_sales[(global_sales < lower_bound) | (global_sales > upper_bound)]
print(f"Number of outliers: {len(outliers)}")
print(f"Outlier percentage: {(len(outliers)/len(global_sales)*100):.2f}%")

# Show top outliers
# Select by index label rather than by value: matching on values with isin()
# also pulls in every other row that happens to share one of the top-5 sales
# figures (so more than five rows can appear) and returns them in frame order
# instead of ranked order. nlargest() already yields exactly five labels,
# sorted from highest to lowest.
top_outliers = game_df_clean.loc[outliers.nlargest(5).index]
print(f"\nTop 5 outliers (highest sales):")
print(top_outliers[['Name', 'Platform', 'Global_Sales']].to_string())

print(f"\n‚úÖ Statistical analysis completed!")

## 7️⃣ Grouping and Aggregation
Advanced groupby operations, pivot tables, and multi-level aggregations

In [None]:
# Grouping and Aggregation
print("üîó GROUPING AND AGGREGATION")
print("="*50)

# 1. Basic GroupBy Operations
print("üìä 1. Basic GroupBy Operations:")
# Group by Genre
genre_sales = game_df_clean.groupby('Genre')['Global_Sales'].sum().sort_values(ascending=False)
print("Total Sales by Genre:")
print(genre_sales.head(10))

# Multiple aggregations
genre_stats = game_df_clean.groupby('Genre')['Global_Sales'].agg(['count', 'mean', 'sum', 'std']).round(3)
print(f"\nGenre Statistics:")
print(genre_stats.head())

# 2. Multiple Column Grouping
print(f"\nüè¢ 2. Multiple Column Grouping:")
publisher_platform = game_df_clean.groupby(['Publisher', 'Platform'])['Global_Sales'].sum().sort_values(ascending=False)
print("Top Publisher-Platform combinations:")
print(publisher_platform.head(10))

# 3. Custom Aggregation Functions
print(f"\n‚öôÔ∏è 3. Custom Aggregation Functions:")
def sales_range(series):
    """Return the spread of ``series``: its maximum minus its minimum."""
    highest = series.max()
    lowest = series.min()
    return highest - lowest

# Named aggregation (output_column=aggregator) is the supported way to pick
# custom result column names when aggregating a single grouped column.
# Passing a {name: func} dict here raises SpecificationError ("nested renamer
# is not supported") on pandas >= 1.0.
custom_agg = game_df_clean.groupby('Genre')['Global_Sales'].agg(
    total='sum',
    average='mean',
    games_count='count',
    max_sales='max',
    sales_range=sales_range,
).round(3)
print("Custom aggregations by Genre:")
print(custom_agg.head())

# 4. Transform and Apply
print(f"\nüîÑ 4. Transform and Apply:")
# Add genre mean to each row
game_df_clean['Genre_Avg_Sales'] = game_df_clean.groupby('Genre')['Global_Sales'].transform('mean')
print("Added Genre_Avg_Sales column")

# Compare each game to genre average
game_df_clean['Above_Genre_Avg'] = game_df_clean['Global_Sales'] > game_df_clean['Genre_Avg_Sales']
above_avg_count = game_df_clean['Above_Genre_Avg'].sum()
print(f"Games above their genre average: {above_avg_count}")

# 5. Pivot Tables
print(f"\nüìã 5. Pivot Tables:")
# Create pivot table: Genre vs Platform
pivot_genre_platform = pd.pivot_table(
    game_df_clean, 
    values='Global_Sales', 
    index='Genre', 
    columns='Platform', 
    aggfunc='mean', 
    fill_value=0
).round(3)
print("Pivot Table - Average Sales by Genre and Platform (showing first 5 genres and platforms):")
print(pivot_genre_platform.iloc[:5, :5])

# 6. Cross-tabulation
print(f"\nüìä 6. Cross-tabulation:")
# Count of games by Genre and Era.
# pd.cut builds right-closed intervals by default, so the first bin would be
# (1980, 1995] and every game released in 1980 would get Era = NaN and vanish
# from the crosstab. include_lowest=True closes the first bin on the left so
# it really covers 1980-1995, as its label says.
game_df_clean['Era'] = pd.cut(
    game_df_clean['Year'],
    bins=[1980, 1995, 2005, 2015, 2020],
    labels=['Early (1980-1995)', 'Classic (1996-2005)',
            'Modern (2006-2015)', 'Current (2016-2020)'],
    include_lowest=True,
)

crosstab = pd.crosstab(game_df_clean['Genre'], game_df_clean['Era'])
print("Games count by Genre and Era:")
print(crosstab)

# 7. Multi-level Grouping with unstack
print(f"\nüìö 7. Multi-level Grouping:")
multi_group = game_df_clean.groupby(['Genre', 'Platform'])['Global_Sales'].mean().unstack(fill_value=0)
print("Multi-level grouping (Genre-Platform average sales):")
print(multi_group.iloc[:5, :5])  # Show subset

# 8. Filtering Groups
print(f"\nüîç 8. Filtering Groups:")
# Only genres with more than 100 games
popular_genres = game_df_clean.groupby('Genre').filter(lambda x: len(x) > 100)
print(f"Games in genres with >100 titles: {len(popular_genres)}")

# Show those popular genres
popular_genre_list = popular_genres['Genre'].value_counts()
print("Popular genres (>100 games):")
print(popular_genre_list)

# 9. Rolling and Expanding Windows
print(f"\nüìà 9. Time Series Analysis:")
# Sales by year
yearly_sales = game_df_clean.groupby('Year')['Global_Sales'].sum().sort_index()
print("Total sales by year (last 10 years):")
print(yearly_sales.tail(10))

# 3-year rolling average
yearly_sales_rolling = yearly_sales.rolling(window=3).mean()
print(f"\n3-year rolling average (last 10 years):")
print(yearly_sales_rolling.tail(10).round(3))

print(f"\n‚úÖ Grouping and aggregation completed!")

## 8️⃣ Data Transformation and Feature Engineering
Advanced data transformation, merging, and feature creation techniques

In [None]:
# Data Transformation and Feature Engineering
print("üîß DATA TRANSFORMATION AND FEATURE ENGINEERING")
print("="*50)

# 1. String Transformations
print("üî§ 1. String Transformations:")
# Publisher name length
game_df_clean['Publisher_Length'] = game_df_clean['Publisher'].str.len()
print(f"Added Publisher_Length column")

# Extract first word of game name
game_df_clean['Name_First_Word'] = game_df_clean['Name'].str.split().str[0]
print(f"Extracted first word of game names")

# Check for sequels (containing numbers)
game_df_clean['Is_Sequel'] = game_df_clean['Name'].str.contains(r'\d', na=False)
sequel_count = game_df_clean['Is_Sequel'].sum()
print(f"Games that are sequels (contain numbers): {sequel_count}")

# 2. Numerical Transformations
print(f"\nüî¢ 2. Numerical Transformations:")
# Log transformation for skewed data
game_df_clean['Log_Global_Sales'] = np.log1p(game_df_clean['Global_Sales'])
print(f"Applied log transformation to Global_Sales")

# Standardization (Z-score)
from scipy import stats
game_df_clean['Sales_Z_Score'] = stats.zscore(game_df_clean['Global_Sales'])
print(f"Added standardized sales scores")

# Min-max scaling
min_sales = game_df_clean['Global_Sales'].min()
max_sales = game_df_clean['Global_Sales'].max()
game_df_clean['Sales_Scaled'] = (game_df_clean['Global_Sales'] - min_sales) / (max_sales - min_sales)
print(f"Added min-max scaled sales")

# 3. Binning and Categorization
print(f"\nüìä 3. Binning and Categorization:")
# Sales performance tiers
game_df_clean['Performance_Tier'] = pd.cut(
    game_df_clean['Global_Sales'], 
    bins=[0, 0.1, 0.5, 1, 5, 100], 
    labels=['Poor', 'Below Average', 'Average', 'Good', 'Excellent']
)
print("Performance Tier distribution:")
print(game_df_clean['Performance_Tier'].value_counts())

# Year decade
game_df_clean['Decade'] = (game_df_clean['Year'] // 10) * 10
print(f"\nDecade distribution:")
print(game_df_clean['Decade'].value_counts().sort_index())

# 4. Regional Analysis Features
print(f"\nüåç 4. Regional Analysis Features:")
# Regional dominance
regional_cols = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
game_df_clean['Dominant_Region'] = game_df_clean[regional_cols].idxmax(axis=1)
game_df_clean['Dominant_Region'] = game_df_clean['Dominant_Region'].str.replace('_Sales', '')

print("Regional dominance distribution:")
print(game_df_clean['Dominant_Region'].value_counts())

# Regional diversity (how evenly distributed sales are)
def calculate_diversity(row):
    """Score how evenly one row's sales are spread across the four regions.

    The score is the natural-log entropy of the regional sales shares
    (NA, EU, JP, Other): 0 when everything sits in a single region, larger
    when the shares are more even. Rows whose regional sales add up to zero
    score 0.
    """
    regional = [row[key] for key in ('NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales')]
    combined = sum(regional)
    if combined == 0:
        return 0

    weighted_logs = 0
    for amount in regional:
        share = amount / combined
        # Regions with no share contribute nothing (and log(0) is undefined).
        if share > 0:
            weighted_logs += share * np.log(share)
    return -weighted_logs

game_df_clean['Regional_Diversity'] = game_df_clean.apply(calculate_diversity, axis=1)
print(f"Average regional diversity: {game_df_clean['Regional_Diversity'].mean():.3f}")

# 5. Ranking Features
print(f"\nüèÜ 5. Ranking Features:")
# Rank within genre
game_df_clean['Genre_Rank'] = game_df_clean.groupby('Genre')['Global_Sales'].rank(method='dense', ascending=False)

# Rank within publisher
game_df_clean['Publisher_Rank'] = game_df_clean.groupby('Publisher')['Global_Sales'].rank(method='dense', ascending=False)

# Show top games in each genre
top_by_genre = game_df_clean[game_df_clean['Genre_Rank'] == 1][['Name', 'Genre', 'Global_Sales']].sort_values('Global_Sales', ascending=False)
print("Top game in each genre:")
print(top_by_genre.head(10).to_string())

# 6. Date/Time Features
print(f"\nüìÖ 6. Time-based Features:")
# Years since release, measured against a fixed reference year.
# NOTE(review): the load cell's printed data range ends at 2020, so any title
# dated after 2016 gets a negative Years_Since_Release here — confirm whether
# 2016 is the intended reference year.
current_year = 2016  # fixed reference year for the age calculation
game_df_clean['Years_Since_Release'] = current_year - game_df_clean['Year']

# Gaming generation (approximate)
def get_generation(year):
    """Map a release year to an approximate console-generation label.

    Each entry in the table is an exclusive upper bound: the first bound the
    year falls below decides the label. Years from 2010 onward are labelled
    "8th Gen".
    """
    era_upper_bounds = (
        (1985, "Pre-NES"),
        (1990, "NES Era"),
        (1995, "16-bit Era"),
        (2000, "32/64-bit Era"),
        (2005, "6th Gen"),
        (2010, "7th Gen"),
    )
    for upper_bound, label in era_upper_bounds:
        if year < upper_bound:
            return label
    return "8th Gen"

game_df_clean['Console_Generation'] = game_df_clean['Year'].apply(get_generation)
print("Console generation distribution:")
print(game_df_clean['Console_Generation'].value_counts())

# 7. Interaction Features
print(f"\n‚ö° 7. Interaction Features:")
# Publisher-Genre interaction
game_df_clean['Publisher_Genre'] = game_df_clean['Publisher'] + "_" + game_df_clean['Genre']

# Platform family
def get_platform_family(platform):
    """Return the hardware maker a platform code belongs to.

    Yields 'Nintendo', 'Sony' or 'Microsoft' for the platform codes listed
    below, and 'Other' for anything else.
    """
    families = (
        ('Nintendo', ['NES', 'SNES', 'N64', 'GC', 'Wii', 'WiiU', 'DS', '3DS', 'GB', 'GBA']),
        ('Sony', ['PS', 'PS2', 'PS3', 'PS4', 'PSP', 'PSV']),
        ('Microsoft', ['XB', 'X360', 'XOne']),
    )
    for maker, codes in families:
        if platform in codes:
            return maker
    return 'Other'

game_df_clean['Platform_Family'] = game_df_clean['Platform'].apply(get_platform_family)
print("Platform family distribution:")
print(game_df_clean['Platform_Family'].value_counts())

# 8. Feature Summary
print(f"\nüìã 8. Feature Engineering Summary:")
new_features = [
    'Publisher_Length', 'Name_First_Word', 'Is_Sequel', 'Log_Global_Sales',
    'Sales_Z_Score', 'Sales_Scaled', 'Performance_Tier', 'Decade',
    'Dominant_Region', 'Regional_Diversity', 'Genre_Rank', 'Publisher_Rank',
    'Years_Since_Release', 'Console_Generation', 'Publisher_Genre', 'Platform_Family'
]

print(f"Created {len(new_features)} new features:")
for feature in new_features:
    print(f"  ‚úÖ {feature}")

print(f"\nFinal dataset shape: {game_df_clean.shape}")
print(f"‚úÖ Data transformation completed!")

## 9️⃣ Data Visualization with Pandas
Creating comprehensive visualizations using pandas plotting capabilities

In [None]:
# Data Visualization with Pandas
print("üìä DATA VISUALIZATION WITH PANDAS")
print("="*50)

# Set style for better plots
plt.style.use('seaborn-v0_8')
plt.rcParams['figure.figsize'] = (12, 8)

# 1. Basic Line Plots
print("üìà 1. Line Plots:")
# Sales trend over years
yearly_sales = game_df_clean.groupby('Year')['Global_Sales'].sum()
plt.figure(figsize=(12, 6))
yearly_sales.plot(kind='line', title='Global Video Game Sales Over Time', 
                  xlabel='Year', ylabel='Total Sales (millions)')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Multiple lines for regional sales
regional_yearly = game_df_clean.groupby('Year')[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']].sum()
# DataFrame.plot opens its own figure unless it is handed an Axes, so a bare
# plt.figure() before it is left behind as an empty extra figure. Create the
# figure/axes pair explicitly and draw into it instead.
fig, ax = plt.subplots(figsize=(12, 6))
regional_yearly.plot(kind='line', ax=ax, title='Regional Sales Trends Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Sales (millions)')
ax.legend(title='Region')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# 2. Bar Charts
print(f"\nüìä 2. Bar Charts:")
# Top genres by sales
top_genres = game_df_clean.groupby('Genre')['Global_Sales'].sum().nlargest(10)
plt.figure(figsize=(12, 6))
top_genres.plot(kind='bar', title='Top 10 Genres by Total Sales', 
                xlabel='Genre', ylabel='Total Sales (millions)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Horizontal bar chart for publishers
top_publishers = game_df_clean.groupby('Publisher')['Global_Sales'].sum().nlargest(10)
plt.figure(figsize=(12, 8))
top_publishers.plot(kind='barh', title='Top 10 Publishers by Total Sales')
plt.xlabel('Total Sales (millions)')
plt.tight_layout()
plt.show()

# 3. Histograms
print(f"\nüìä 3. Histograms:")
# Distribution of global sales
plt.figure(figsize=(12, 6))
game_df_clean['Global_Sales'].plot(kind='hist', bins=50, alpha=0.7, 
                                   title='Distribution of Global Sales')
plt.xlabel('Global Sales (millions)')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

# Log-transformed sales distribution
plt.figure(figsize=(12, 6))
game_df_clean['Log_Global_Sales'].plot(kind='hist', bins=50, alpha=0.7,
                                       title='Distribution of Log-Transformed Sales')
plt.xlabel('Log(Global Sales + 1)')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

# 4. Box Plots
print(f"\nüì¶ 4. Box Plots:")
# Sales by platform family
# DataFrame.boxplot(by=...) builds a new figure when no Axes is supplied, so
# a preceding plt.figure() would be shown as an empty extra figure. Create the
# Axes first and pass it in; the figure size then lives on plt.subplots.
fig, ax = plt.subplots(figsize=(12, 6))
game_df_clean.boxplot(column='Global_Sales', by='Platform_Family', ax=ax)
ax.set_title('Sales Distribution by Platform Family')
ax.set_ylabel('Global Sales (millions)')
plt.tight_layout()
plt.show()

# Sales by performance tier (same explicit-Axes pattern as above)
fig, ax = plt.subplots(figsize=(12, 6))
game_df_clean.boxplot(column='Global_Sales', by='Performance_Tier', ax=ax)
ax.set_title('Sales Distribution by Performance Tier')
ax.set_ylabel('Global Sales (millions)')
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()

# 5. Scatter Plots
print(f"\nüîç 5. Scatter Plots:")
# Create sample for better visualization.
# Cap the sample size at the number of available rows: sample(n=1000) raises
# ValueError when the frame holds fewer than 1000 rows.
sample_games = game_df_clean.sample(n=min(1000, len(game_df_clean)), random_state=42)

# DataFrame.plot opens its own figure unless it is handed an Axes, so draw
# into an explicitly created Axes rather than leaving an empty plt.figure().
fig, ax = plt.subplots(figsize=(12, 6))
sample_games.plot(kind='scatter', x='Year', y='Global_Sales', alpha=0.6, ax=ax,
                  title='Game Sales vs Release Year')
ax.set_xlabel('Release Year')
ax.set_ylabel('Global Sales (millions)')
plt.tight_layout()
plt.show()

# 6. Pie Charts
print(f"\nü•ß 6. Pie Charts:")
# Platform family market share
platform_share = game_df_clean.groupby('Platform_Family')['Global_Sales'].sum()
plt.figure(figsize=(10, 8))
platform_share.plot(kind='pie', autopct='%1.1f%%', 
                    title='Market Share by Platform Family')
plt.ylabel('')  # Remove y-label for pie charts
plt.tight_layout()
plt.show()

# Genre distribution by count
genre_counts = game_df_clean['Genre'].value_counts()
plt.figure(figsize=(12, 8))
genre_counts.plot(kind='pie', autopct='%1.1f%%',
                 title='Game Distribution by Genre')
plt.ylabel('')
plt.tight_layout()
plt.show()

# 7. Area Plots
print(f"\nüèîÔ∏è 7. Area Plots:")
# Stacked area plot for regional sales over time.
# DataFrame.plot opens its own figure unless it is handed an Axes, so draw
# into an explicitly created Axes rather than leaving an empty plt.figure().
fig, ax = plt.subplots(figsize=(12, 6))
regional_yearly.plot(kind='area', stacked=True, alpha=0.7, ax=ax,
                     title='Regional Sales Evolution (Stacked)')
ax.set_xlabel('Year')
ax.set_ylabel('Sales (millions)')
ax.legend(title='Region')
plt.tight_layout()
plt.show()

# 8. Hexbin Plots
print(f"\n‚¨° 8. Hexbin Plots:")
# For high-density scatter plots.
# DataFrame.plot opens its own figure unless it is handed an Axes, so draw
# into an explicitly created Axes rather than leaving an empty plt.figure().
fig, ax = plt.subplots(figsize=(12, 6))
game_df_clean.plot(kind='hexbin', x='Year', y='Global_Sales', gridsize=20, ax=ax,
                   title='Sales Density by Year (Hexbin)')
ax.set_xlabel('Year')
ax.set_ylabel('Global Sales (millions)')
plt.tight_layout()
plt.show()

# 9. Subplots with pandas
print(f"\nüìä 9. Multiple Subplots:")
# Create subplot grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Sales by decade
decade_sales = game_df_clean.groupby('Decade')['Global_Sales'].sum()
decade_sales.plot(kind='bar', ax=axes[0,0], title='Sales by Decade')
axes[0,0].set_xlabel('Decade')
axes[0,0].tick_params(axis='x', rotation=45)

# Console generation distribution
gen_counts = game_df_clean['Console_Generation'].value_counts()
gen_counts.plot(kind='pie', ax=axes[0,1], title='Games by Console Generation')
axes[0,1].set_ylabel('')

# Regional diversity histogram
game_df_clean['Regional_Diversity'].plot(kind='hist', bins=30, ax=axes[1,0],
                                         title='Regional Diversity Distribution')
axes[1,0].set_xlabel('Diversity Score')

# Performance tier counts
perf_counts = game_df_clean['Performance_Tier'].value_counts()
perf_counts.plot(kind='bar', ax=axes[1,1], title='Games by Performance Tier')
axes[1,1].set_xlabel('Performance Tier')
axes[1,1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

print(f"\n‚úÖ Data visualization completed!")

## 🔟 Advanced Analysis and Insights
Complex analytical operations, time series analysis, and business insights

In [None]:
# Advanced Analysis and Insights
print("üéØ ADVANCED ANALYSIS AND INSIGHTS")
print("="*50)

# 1. Time Series Analysis
print("üìà 1. Advanced Time Series Analysis:")
# Create proper time series index
yearly_data = game_df_clean.groupby('Year').agg({
    'Global_Sales': ['sum', 'mean', 'count'],
    'NA_Sales': 'sum',
    'EU_Sales': 'sum', 
    'JP_Sales': 'sum'
}).round(3)

# Flatten column names
yearly_data.columns = ['_'.join(col).strip() for col in yearly_data.columns]
print("Yearly gaming industry metrics:")
print(yearly_data.tail(10))

# Calculate year-over-year growth
yearly_sales = game_df_clean.groupby('Year')['Global_Sales'].sum()
yoy_growth = yearly_sales.pct_change() * 100
print(f"\nYear-over-year growth rates (last 10 years):")
print(yoy_growth.tail(10).round(2))

# Market volatility (standard deviation of yearly growth)
volatility = yoy_growth.std()
print(f"Market volatility (YoY growth std): {volatility:.2f}%")

# 2. Market Share Analysis
print(f"\nüè¢ 2. Market Share Analysis:")
# Publisher market dominance over time
top_publishers = game_df_clean['Publisher'].value_counts().head(5).index

publisher_yearly = game_df_clean[game_df_clean['Publisher'].isin(top_publishers)].groupby(['Year', 'Publisher'])['Global_Sales'].sum().unstack(fill_value=0)
market_share = publisher_yearly.div(publisher_yearly.sum(axis=1), axis=0) * 100

print("Market share evolution for top publishers (last 5 years):")
print(market_share.tail().round(2))

# Platform lifecycle analysis
platform_lifecycle = game_df_clean.groupby(['Platform', 'Year'])['Global_Sales'].sum().unstack(fill_value=0)
print(f"\nPlatform with longest lifecycle:")
platform_years = (platform_lifecycle > 0).sum(axis=1).sort_values(ascending=False)
print(platform_years.head(10))

# 3. Regional Market Analysis
print(f"\nüåç 3. Regional Market Dynamics:")
# Calculate regional preferences by genre
regional_genre = game_df_clean.groupby('Genre')[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']].sum()
regional_genre_pct = regional_genre.div(regional_genre.sum(axis=0), axis=1) * 100

print("Regional genre preferences (% of regional market):")
print(regional_genre_pct.round(2))

# Find most region-specific genres
regional_variance = regional_genre_pct.var(axis=1).sort_values(ascending=False)
print(f"\nMost region-specific genres (highest variance):")
print(regional_variance.head())

# 4. Success Factor Analysis
print(f"\nüèÜ 4. Success Factor Analysis:")
# Analyze what makes games successful
high_performers = game_df_clean[game_df_clean['Performance_Tier'] == 'Excellent']
low_performers = game_df_clean[game_df_clean['Performance_Tier'] == 'Poor']

print(f"High performers: {len(high_performers)} games")
print(f"Low performers: {len(low_performers)} games")

# Compare characteristics
print(f"\nSuccess factors comparison:")
print(f"Average publisher length - High: {high_performers['Publisher_Length'].mean():.1f}, Low: {low_performers['Publisher_Length'].mean():.1f}")
print(f"Sequel rate - High: {(high_performers['Is_Sequel'].sum()/len(high_performers)*100):.1f}%, Low: {(low_performers['Is_Sequel'].sum()/len(low_performers)*100):.1f}%")
print(f"Regional diversity - High: {high_performers['Regional_Diversity'].mean():.3f}, Low: {low_performers['Regional_Diversity'].mean():.3f}")

# Genre success rates
genre_success = game_df_clean.groupby('Genre')['Performance_Tier'].apply(lambda x: (x == 'Excellent').sum() / len(x) * 100).sort_values(ascending=False)
print(f"\nGenre success rates (% excellent games):")
print(genre_success.round(2))

# 5. Trend Analysis
print(f"\nüìä 5. Industry Trend Analysis:")
# Gaming industry maturity indicators
games_per_year = game_df_clean.groupby('Year').size()
unique_publishers_per_year = game_df_clean.groupby('Year')['Publisher'].nunique()
avg_sales_per_game = yearly_sales / games_per_year

print("Industry maturity metrics:")
print(f"Games released (last 10 years): {games_per_year.tail(10).to_dict()}")
print(f"Unique publishers (last 10 years): {unique_publishers_per_year.tail(10).to_dict()}")
print(f"Avg sales per game (last 10 years): {avg_sales_per_game.tail(10).round(3).to_dict()}")

# Platform generation transitions
platform_transitions = game_df_clean.groupby(['Console_Generation', 'Year']).size().unstack(fill_value=0)
print(f"\nConsole generation transitions:")
print(platform_transitions.tail())

# 6. Predictive Insights
print(f"\nüîÆ 6. Predictive Insights:")
# Calculate momentum indicators
recent_years = game_df_clean[game_df_clean['Year'] >= 2010]

# Publisher momentum (recent growth): one row per publisher, one column per year
publisher_momentum = recent_years.groupby(['Publisher', 'Year'])['Global_Sales'].sum().unstack(fill_value=0)


def _trend_strength(yearly_totals):
    """Correlate a publisher's yearly sales with a 0..n-1 time-step counter.

    Series.corr aligns both operands on their index before correlating, so the
    counter must carry the same (year) labels as the sales row; a default
    0..n-1 index shares no labels with the years and makes every result NaN.
    """
    time_steps = pd.Series(range(len(yearly_totals)), index=yearly_totals.index)
    return yearly_totals.corr(time_steps)


publisher_growth = publisher_momentum.apply(_trend_strength, axis=1).sort_values(ascending=False)

print("Publishers with strongest growth momentum (2010+):")
print(publisher_growth.head(10).round(3))

# Genre lifecycle stages
genre_age = game_df_clean.groupby('Genre')['Year'].agg(['min', 'max', 'mean'])
genre_age['lifespan'] = genre_age['max'] - genre_age['min']
genre_age['maturity'] = 2016 - genre_age['mean']  # Years since average release

print(f"\nGenre lifecycle analysis:")
print(genre_age.sort_values('maturity').round(1))

# 7. Business Intelligence Summary
print(f"\nüíº 7. Business Intelligence Summary:")
total_market_size = game_df_clean['Global_Sales'].sum()
total_games = len(game_df_clean)
avg_game_sales = total_market_size / total_games

print(f"üìä MARKET OVERVIEW:")
print(f"  Total market size: {total_market_size:.1f} million units")
print(f"  Total games analyzed: {total_games:,}")
print(f"  Average game sales: {avg_game_sales:.3f} million")
print(f"  Market leaders: {', '.join(game_df_clean['Publisher'].value_counts().head(3).index)}")
print(f"  Dominant genres: {', '.join(game_df_clean['Genre'].value_counts().head(3).index)}")
print(f"  Platform variety: {game_df_clean['Platform'].nunique()} different platforms")

# Key insights
top_game = game_df_clean.loc[game_df_clean['Global_Sales'].idxmax()]
print(f"\nüèÜ KEY INSIGHTS:")
print(f"  Best-selling game: {top_game['Name']} ({top_game['Global_Sales']:.2f}M)")
print(f"  Most productive year: {yearly_sales.idxmax()} ({yearly_sales.max():.1f}M total sales)")
print(f"  Most diverse publisher: {game_df_clean.groupby('Publisher')['Genre'].nunique().idxmax()}")
print(f"  Peak gaming era: {game_df_clean['Console_Generation'].mode().iloc[0]}")

print(f"\n‚úÖ Advanced analysis completed!")
print(f"\nüéâ PANDAS LAB COMPLETE - ALL FUNCTIONS DEMONSTRATED! üéâ")