## <span style="color:green">Pre-Processing Steps</span>

In [1]:
#import pandas
import pandas as pd

In [3]:
#load the dataset
df = pd.read_csv('vgsales.csv')

#print dataset to view columns and values
print(df)

        Rank                                              Name Platform  \
0          1                                        Wii Sports      Wii   
1          2                                 Super Mario Bros.      NES   
2          3                                    Mario Kart Wii      Wii   
3          4                                 Wii Sports Resort      Wii   
4          5                          Pokemon Red/Pokemon Blue       GB   
...      ...                                               ...      ...   
16593  16596                Woody Woodpecker in Crazy Castle 5      GBA   
16594  16597                     Men in Black II: Alien Escape       GC   
16595  16598  SCORE International Baja 1000: The Official Game      PS2   
16596  16599                                        Know How 2       DS   
16597  16600                                  Spirits & Spells      GBA   

         Year         Genre   Publisher  NA_Sales  EU_Sales  JP_Sales  \
0      2006.0        Sport

## <span style="color:blue">Guiding Question 1</span>
Are all columns in the correct data type? If not, how can we convert them into the appropriate type?

In [6]:
#check type of columns with numeric values
print(df[['Rank', 'Year', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']].dtypes)

Rank              int64
Year            float64
NA_Sales        float64
EU_Sales        float64
JP_Sales        float64
Other_Sales     float64
Global_Sales    float64
dtype: object


In [8]:
#rank is in the correct type because each Rank is a whole number with no decimals
#each sales value (NA_Sales, EU_Sales, JP_Sales, Other_Sales, Global_Sales) are all in decimal form, so the float type is appropriate

#the 'Year' column is in the wrong type and should be converted to an integer type since it is only showing non-decimal values

#ensure that all NA values are replaced with "0" to be able to adjust the type to integer
df['Year'] = df['Year'].fillna(0)

#convert 'Year' to integer type
df['Year'] = df['Year'].astype(int)

#Verify type change
print(df['Year'].dtype)

int32


## <span style="color:blue">Guiding Question 2</span>
Can we combine the North American sales (NA_Sales) & European sales (EU_Sales) column into one column by sum then group the dataset by Platform?

In [9]:
#create a new column within the dataset
df['NAEU_Sales'] = df['NA_Sales'] + df['EU_Sales']

#check the dataset to ensure our new column is added
print(df)

        Rank                                              Name Platform  Year  \
0          1                                        Wii Sports      Wii  2006   
1          2                                 Super Mario Bros.      NES  1985   
2          3                                    Mario Kart Wii      Wii  2008   
3          4                                 Wii Sports Resort      Wii  2009   
4          5                          Pokemon Red/Pokemon Blue       GB  1996   
...      ...                                               ...      ...   ...   
16593  16596                Woody Woodpecker in Crazy Castle 5      GBA  2002   
16594  16597                     Men in Black II: Alien Escape       GC  2003   
16595  16598  SCORE International Baja 1000: The Official Game      PS2  2008   
16596  16599                                        Know How 2       DS  2010   
16597  16600                                  Spirits & Spells      GBA  2003   

              Genre   Publi

In [11]:
#group by platform
grouped = df.groupby(['Platform'])['NAEU_Sales'].sum()
print(grouped)

Platform
2600     96.07
3DO       0.00
3DS     137.39
DC        7.12
DS      585.36
GB      162.14
GBA     262.79
GC      172.17
GEN      24.79
GG        0.00
N64     180.08
NES     147.09
NG        0.00
PC      232.96
PCFX      0.00
PS      550.11
PS2     923.13
PS3     735.97
PS4     220.50
PSP     177.24
PSV      32.53
SAT       1.26
SCD       1.36
SNES     80.27
TG16      0.00
WS        0.00
Wii     776.09
WiiU     62.55
X360    881.63
XB      247.64
XOne    128.84
Name: NAEU_Sales, dtype: float64


## <span style="color:blue">Guiding Question 3</span>
How can we manipulate the dataset to show the Top 25 games by North American sales (NA_Sales) with a release date of 2010 onwards?

In [14]:
#create a new dataframe that only contains records of games released in 2010 or later
df2010 = df[df['Year'] >= 2010]

In [15]:
#show the top 25 ranking games by North American sales (NA_Sales)
df2010.nlargest(25, 'NA_Sales')

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,NAEU_Sales
15,16,Kinect Adventures!,X360,2010,Misc,Microsoft Game Studios,14.97,4.94,0.24,1.67,21.82,19.91
31,32,Call of Duty: Black Ops,X360,2010,Shooter,Activision,9.67,3.73,0.11,1.13,14.64,13.4
23,24,Grand Theft Auto V,X360,2013,Action,Take-Two Interactive,9.63,5.31,0.06,1.38,16.38,14.94
29,30,Call of Duty: Modern Warfare 3,X360,2011,Shooter,Activision,9.03,4.28,0.13,1.32,14.76,13.31
35,36,Call of Duty: Black Ops II,X360,2012,Shooter,Activision,8.25,4.3,0.07,1.12,13.73,12.55
62,63,Halo: Reach,X360,2010,Shooter,Microsoft Game Studios,7.03,1.98,0.08,0.78,9.88,9.01
16,17,Grand Theft Auto V,PS3,2013,Action,Take-Two Interactive,7.01,9.27,0.97,4.14,21.4,16.28
61,62,Call of Duty: Ghosts,X360,2013,Shooter,Activision,6.72,2.63,0.04,0.82,10.21,9.35
65,66,Halo 4,X360,2012,Shooter,Microsoft Game Studios,6.63,2.36,0.04,0.73,9.76,8.99
60,61,Just Dance 3,Wii,2011,Misc,Ubisoft,6.05,3.15,0.0,1.07,10.26,9.2


In [16]:
#drop unecessary columns
df2010.nlargest(25, 'NA_Sales').loc[:, ['Name', 'Year', 'NA_Sales']]

Unnamed: 0,Name,Year,NA_Sales
15,Kinect Adventures!,2010,14.97
31,Call of Duty: Black Ops,2010,9.67
23,Grand Theft Auto V,2013,9.63
29,Call of Duty: Modern Warfare 3,2011,9.03
35,Call of Duty: Black Ops II,2012,8.25
62,Halo: Reach,2010,7.03
16,Grand Theft Auto V,2013,7.01
61,Call of Duty: Ghosts,2013,6.72
65,Halo 4,2012,6.63
60,Just Dance 3,2011,6.05


## <span style="color:blue">Guiding Question 4</span>
How can we rename the column headers to make them more digestable for the user?

In [22]:
#renaming columns
df.rename(columns={'Name': 'Game_Title', 'Year': 'Year_Released'}, inplace=True)

#verify changes
print(df)

        Rank                                        Game_Title Platform  \
0          1                                        Wii Sports      Wii   
1          2                                 Super Mario Bros.      NES   
2          3                                    Mario Kart Wii      Wii   
3          4                                 Wii Sports Resort      Wii   
4          5                          Pokemon Red/Pokemon Blue       GB   
...      ...                                               ...      ...   
16593  16596                Woody Woodpecker in Crazy Castle 5      GBA   
16594  16597                     Men in Black II: Alien Escape       GC   
16595  16598  SCORE International Baja 1000: The Official Game      PS2   
16596  16599                                        Know How 2       DS   
16597  16600                                  Spirits & Spells      GBA   

       Year_Released         Genre   Publisher  NA_Sales  EU_Sales  JP_Sales  \
0               200