# DSC550-T301 Data Mining 
## 1.2 Exercise: Exploring a Pandas Data Frame

### Import libraries

In [24]:
# Import libraries
import pandas as pd
import numpy as np

### Load the dataset as a Pandas data frame.

In [27]:
# Import data as a pandas data frame
df_games = pd.read_csv("Video_Games_Sales_as_at_22_Dec_2016.csv")

### Display the first ten rows of data.

In [30]:
print("First 10 rows:\n", df_games.head(10))

First 10 rows:
                         Name Platform  Year_of_Release         Genre  \
0                 Wii Sports      Wii           2006.0        Sports   
1          Super Mario Bros.      NES           1985.0      Platform   
2             Mario Kart Wii      Wii           2008.0        Racing   
3          Wii Sports Resort      Wii           2009.0        Sports   
4   Pokemon Red/Pokemon Blue       GB           1996.0  Role-Playing   
5                     Tetris       GB           1989.0        Puzzle   
6      New Super Mario Bros.       DS           2006.0      Platform   
7                   Wii Play      Wii           2006.0          Misc   
8  New Super Mario Bros. Wii      Wii           2009.0      Platform   
9                  Duck Hunt      NES           1984.0       Shooter   

  Publisher  NA_Sales  EU_Sales  JP_Sales  Other_Sales  Global_Sales  \
0  Nintendo     41.36     28.96      3.77         8.45         82.53   
1  Nintendo     29.08      3.58      6.81      

### Find the dimensions (number of rows and columns) in the data frame. 

In [33]:
dimensions = df_games.shape
print(f"Dimensions: {dimensions}")
print(f"Number of rows: {dimensions[0]}")
print(f"Number of columns: {dimensions[1]}")

Dimensions: (16719, 16)
Number of rows: 16719
Number of columns: 16


### What these numbers represent in context:

Number of rows: Represents the number of observations, samples, or records in the dataset

Number of columns: Represents the number of variables, features, or attributes for each observation

### Find the top five games by critic score.


In [54]:
df_games[['Name', 'Platform', 'Year_of_Release', 'Critic_Score']].sort_values(by= 'Critic_Score', axis = 0, ascending = False).head(5)

Unnamed: 0,Name,Platform,Year_of_Release,Critic_Score
227,Tony Hawk's Pro Skater 2,PS,2000.0,98.0
57,Grand Theft Auto IV,PS3,2008.0,98.0
51,Grand Theft Auto IV,X360,2008.0,98.0
5350,SoulCalibur,DC,1999.0,98.0
165,Grand Theft Auto V,XOne,2014.0,97.0


### Find the number of video games in the data frame in each genre.


In [91]:
df_games['Genre'].value_counts()

Genre
Action          3370
Sports          2348
Misc            1750
Role-Playing    1500
Shooter         1323
Adventure       1303
Racing          1249
Platform         888
Simulation       874
Fighting         849
Strategy         683
Puzzle           580
Name: count, dtype: int64

### Find the first five games in the data frame on the SNES platform.


In [88]:
# Find first 5 games using query
snes_games = df_games[['Name', 'Platform', 'Year_of_Release']].query('Platform == "SNES"')
first_five_snes = snes_games.head(5)

print(first_five_snes)

                                     Name Platform  Year_of_Release
18                      Super Mario World     SNES           1990.0
56                  Super Mario All-Stars     SNES           1993.0
71                    Donkey Kong Country     SNES           1994.0
76                       Super Mario Kart     SNES           1992.0
137  Street Fighter II: The World Warrior     SNES           1992.0


### Find the five publishers with the highest total global sales. Note: You will need to calculate the total global sales for each publisher to do this.


In [105]:
# Calculate global sales and find top 5 publishers
top_publishers = df_games.groupby('Publisher')['Global_Sales'].sum().sort_values(ascending = False).head(5)
print(top_publishers)

Publisher
Nintendo                       1788.81
Electronic Arts                1116.96
Activision                      731.16
Sony Computer Entertainment     606.48
Ubisoft                         471.61
Name: Global_Sales, dtype: float64


### Find the number NaN entries (missing data values) in each column.


In [41]:
# Check for missing data
df_games.isna().sum()

Name                  2
Platform              0
Year_of_Release     269
Genre                 2
Publisher            54
NA_Sales              0
EU_Sales              0
JP_Sales              0
Other_Sales           0
Global_Sales          0
Critic_Score       8582
Critic_Count       8582
User_Score         6704
User_Count         9129
Developer          6623
Rating             6769
dtype: int64

### Create a new column in the data frame that calculates the percentage of global sales from North America. Display the first five rows of the new data frame.


In [118]:
# Create the new column with the percentage of global sales from North America
df_games['NA_Sales_Percentage'] = (df_games['NA_Sales'] / df_games['Global_Sales']) * 100

# Display first five rows
print(df_games.head(5))

                       Name Platform  Year_of_Release         Genre Publisher  \
0                Wii Sports      Wii           2006.0        Sports  Nintendo   
1         Super Mario Bros.      NES           1985.0      Platform  Nintendo   
2            Mario Kart Wii      Wii           2008.0        Racing  Nintendo   
3         Wii Sports Resort      Wii           2009.0        Sports  Nintendo   
4  Pokemon Red/Pokemon Blue       GB           1996.0  Role-Playing  Nintendo   

   NA_Sales  EU_Sales  JP_Sales  Other_Sales  Global_Sales  Critic_Score  \
0     41.36     28.96      3.77         8.45         82.53          76.0   
1     29.08      3.58      6.81         0.77         40.24           NaN   
2     15.68     12.76      3.79         3.29         35.52          82.0   
3     15.61     10.93      3.28         2.95         32.77          80.0   
4     11.27      8.89     10.22         1.00         31.37           NaN   

   Critic_Count User_Score  User_Count Developer Rating 

### Try to calculate the median user score of all the video games. You will likely run into an error because some of the user score entries are a non-numerical string that cannot be converted to a float. Find and replace this string with NaN and then calculate the median. Then, replace all NaN entries in the user score column with the median value.

In [141]:
# Replace strings with NaN
df_games['User_Score'] = pd.to_numeric(df_games['User_Score'], errors='coerce')

# Calculate mean
average_score = df_games['User_Score'].mean()
print("Mean user score: \n", average_score)

# Replace NaN with mean
df_games['User_Score'] = df_games['User_Score'].fillna(average_score)

# Check that there are no more NaN values
print(f"NaN values after replacement: {df_games['User_Score'].isna().sum()}")

Mean user score: 
 7.125046113306982
NaN values after replacement: 0


### Comments
<ul>
    <li> Critic_Score, Critic_Count, User_Score, User_Count and Rating are missing data
    <li>
<ul>