## Data Analysis with Pandas

Author: Daniel Dills


In [208]:
import pandas as pd
import numpy as np

In [209]:
df = pd.read_csv("vgsales.csv")

In [210]:
df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


### Which company is the most common video game publisher?

In [211]:
most_common_publisher = df["Publisher"].value_counts()
most_common_publisher.head(1)

Electronic Arts    1351
Name: Publisher, dtype: int64

### What’s the most common platform?

In [212]:
most_common_platform = df["Platform"].value_counts()
most_common_platform.head(1)

DS    2163
Name: Platform, dtype: int64

### What about the most common genre?


In [213]:
most_common_genre = df["Genre"].value_counts()
most_common_genre.head(1)

Action    3316
Name: Genre, dtype: int64

### What are the top 20 highest grossing games?

In [214]:
highest_grossing_games = df[["Name", "Global_Sales"]].sort_values("Global_Sales", ascending=False)
highest_grossing_games.head(20)

Unnamed: 0,Name,Global_Sales
0,Wii Sports,82.74
1,Super Mario Bros.,40.24
2,Mario Kart Wii,35.82
3,Wii Sports Resort,33.0
4,Pokemon Red/Pokemon Blue,31.37
5,Tetris,30.26
6,New Super Mario Bros.,30.01
7,Wii Play,29.02
8,New Super Mario Bros. Wii,28.62
9,Duck Hunt,28.31


### For North American video game sales, what’s the median?

In [215]:
na_median_sales = df["NA_Sales"].median()
print(na_median_sales)

0.08


### Provide a secondary output showing ten games surrounding the median sales output, assume that games with same median value are sorted in descending order


In [216]:
na_ten_median_sales = df[df["NA_Sales"] == na_median_sales]
na_ten_median_sales[["Rank","Name", "Platform", "NA_Sales"]].head(10)

Unnamed: 0,Rank,Name,Platform,NA_Sales
446,447,Dragon Warrior IV,NES,0.08
497,498,World Soccer Winning Eleven 7 International,PS2,0.08
1617,1619,Farming Simulator 2015,PC,0.08
1926,1928,Pro Evolution Soccer 2008,X360,0.08
2067,2069,Winning Eleven: Pro Evolution Soccer 2007 (All...,X360,0.08
2373,2375,Phantasy Star Portable 2,PSP,0.08
2579,2581,The Sims 2: Castaway,PSP,0.08
3186,3188,SingStar Queen,PS2,0.08
3503,3505,Top Spin 3,PS3,0.08
3703,3705,Sonic & All-Stars Racing Transformed,PS3,0.08


### For the top-selling game of all time, how many standard deviations above/below the mean are its sales for North America?

In [217]:
na_top_selling_game = df[["Name", "Platform", "NA_Sales", "Global_Sales"]].sort_values("Global_Sales", ascending=False)
na_top_selling_game.head(1)

Unnamed: 0,Name,Platform,NA_Sales,Global_Sales
0,Wii Sports,Wii,41.49,82.74


### Standard deviation for top selling game in North America


In [218]:
na_std = df["NA_Sales"].std()
print(na_std)


0.8166830292988796


### Standard deviations above/below the mean are its sales for North America?

In [219]:
# z-score measures exactly how many standard deviations above or below the mean a data point is.
max = df[["NA_Sales"]].max()
mean = df[["NA_Sales"]].mean()
result = (max-mean)/na_std
result

NA_Sales    50.478988
dtype: float64

### Nintendo Wii average number of global sales

In [220]:
wii_global_sales = df[df["Platform"] == "Wii"]["Global_Sales"].mean()
print(f"Wii avg Global Sales {wii_global_sales}.")

Wii avg Global Sales 0.6994037735849057.


### Other platform average global sales

In [221]:
non_wii_game_global_sales = df[df["Platform"] != "Wii"]["Global_Sales"].mean()
print(f"Other platform average global sales {non_wii_game_global_sales}.")

Other platform average global sales 0.5233896418516336.
