In [1]:
# Import Dependencies
import pandas as pd
import os
import json
import numpy as np
from collections import OrderedDict

In [2]:
# Create reference to json file
json_file = open('purchase_data.json', 'r', encoding = 'utf-8')
purchases = json.load(json_file)
json_file.close()

In [3]:
df = pd.DataFrame(purchases)

In [4]:
# Player Count - Total number of players
total_players = len(df['SN'].unique())
total_players_df = pd.DataFrame({"Total Players" : total_players}, index = [0])
total_players_df

Unnamed: 0,Total Players
0,573


In [5]:
#Purchasing Analysis (Total)

#  Number of Unique Items
#  Average Purchase Price
#  Total Number of Purchases
#  Total Revenue

# Create data frame to show show purchasing analysis by total. 
purchasing_total_df = pd.DataFrame({'Number of Unique Items' : len(df['Item ID'].unique()),
                                   'Average Purchase Price' : df['Price'].mean(),
                                   'Total Number of Purchases': df['SN'].count(),
                                   'Total Revenue' : df['Price'].sum()}, index = [0])

# Reorder columns as listed in instruction
purchasing_total_df = purchasing_total_df[['Number of Unique Items', 'Average Purchase Price', 
                                           'Total Number of Purchases', 'Total Revenue']]

# Format all pricing with '$' and two digit decimal places.
purchasing_total_df['Average Purchase Price'] = purchasing_total_df['Average Purchase Price'].map("${:,.2f}".format)
purchasing_total_df['Total Revenue'] = purchasing_total_df['Total Revenue'].map("${:,.2f}".format)

purchasing_total_df

Unnamed: 0,Number of Unique Items,Average Purchase Price,Total Number of Purchases,Total Revenue
0,183,$2.93,780,"$2,286.33"


In [6]:
# Gender Demographics

#   Percentage and Count of Male Players
#   Percentage and Count of Female Players
#   Percentage and Count of Other / Non-Disclosed

#  Create new data frame which includes unique players.
unique_df = df.drop_duplicates(['SN'])

# Get count of gender
groupby_gender = pd.DataFrame(unique_df["Gender"].value_counts())

groupby_gender.reset_index(inplace=True)
groupby_gender.columns=["Gender", "Gender Count"]

# Calculate percentage of all gender and reformat to as percentage value.
groupby_gender["% Gender"]=  groupby_gender["Gender Count"] / total_players * 100
groupby_gender["% Gender"] = groupby_gender["% Gender"].map("{0:,.2f}%".format)
groupby_gender

Unnamed: 0,Gender,Gender Count,% Gender
0,Male,465,81.15%
1,Female,100,17.45%
2,Other / Non-Disclosed,8,1.40%


In [7]:
# Purchasing Analysis (Gender)

#The below each broken by gender
#  Purchase Count
#  Average Purchase Price
#  Total Purchase Value
#  Normalized Totals = Total Purchase Value / Purchase Count by Gender
by_Gender = df.groupby(["Gender"])
by_Gender.head()

# Create a data frame to show purchasing analysis by gender.  Note: Each column was added to the data frame separately 
# so the columns are in the proper order.
purchase_by_gender_df = pd.DataFrame({'Purchase Count' : by_Gender['Item ID'].count()})
purchase_by_gender_df["Average Purchase Price"] =  by_Gender["Price"].mean()
purchase_by_gender_df["Total Purchase Value"] = by_Gender["Price"].sum()
purchase_by_gender_df["Normalized Totals"] = by_Gender["Price"].sum() / by_Gender["Item ID"].count()

# Format all pricing with '$' and two digit decimal places.
purchase_by_gender_df['Average Purchase Price'] = purchase_by_gender_df['Average Purchase Price'].map("${:,.2f}".format)
purchase_by_gender_df['Total Purchase Value'] = purchase_by_gender_df['Total Purchase Value'].map("${:,.2f}".format)
purchase_by_gender_df['Normalized Totals'] = purchase_by_gender_df['Normalized Totals'].map("${:,.2f}".format)

purchase_by_gender_df

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value,Normalized Totals
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,136,$2.82,$382.91,$2.82
Male,633,$2.95,"$1,867.68",$2.95
Other / Non-Disclosed,11,$3.25,$35.74,$3.25


In [8]:
# Age Demographics

# The below each broken into bins of 4 years (i.e. <10, 10-14, 15-19, etc.)
#   Purchase Count
#   Average Purchase Price
#   Total Purchase Value
#   Normalized Totals = Total Purchase Value / Purchase Count

# Create the bins in which Data will be held
# Bins are <10, 10-14, 15-19, 
bins = [0, 9, 14, 19, 23, 27, 31, 100]

# Create the names for the four bins by age range
group_names = ['<10', '10-14', '15-19', '20-23', '24-27', '28-31', '>31']

# Add age range column to entire list of purchases.
df["Age Range"] = pd.cut(df["Age"], bins, labels=group_names)

# Group data frame by age range
by_ageRange = df.groupby(["Age Range"])

# Create data frame to show age demographics.  Note: Again, each column is added separately to retain correct order.
ageRange_df = pd.DataFrame({'Purchase Count' : by_ageRange["Item ID"].count()})
ageRange_df["Average Purchase Price"] = by_ageRange["Price"].mean()
ageRange_df["Total Purchase Value"] = by_ageRange["Price"].sum()
ageRange_df["Normalized Totals"] = by_ageRange["Price"].sum() / by_ageRange["Item ID"].count()

# Format all pricing with '$' and two digit decimal places.
ageRange_df["Average Purchase Price"] = ageRange_df["Average Purchase Price"].map("${:,.2f}".format)
ageRange_df["Total Purchase Value"] = ageRange_df["Total Purchase Value"].map("${:,.2f}".format)
ageRange_df["Normalized Totals"] = ageRange_df["Normalized Totals"].map("${:,.2f}".format)

ageRange_df

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value,Normalized Totals
Age Range,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
<10,28,$2.98,$83.46,$2.98
10-14,35,$2.77,$96.95,$2.77
15-19,133,$2.91,$386.42,$2.91
20-23,266,$2.88,$765.31,$2.88
24-27,169,$3.02,$510.02,$3.02
28-31,60,$2.96,$177.40,$2.96
>31,89,$3.00,$266.77,$3.00


In [9]:
#Top Spenders
#
#  Identify the the top 5 spenders in the game by total purchase value, then list (in a table):
#  SN
#  Purchase Count
#  Average Purchase Price
#  Total Purchase Value

# Group by player and total spent.
spenders = pd.DataFrame(df.groupby('SN')['Price'].sum())

# Grab only the top 5.
top_spenders = spenders.sort_values('Price', ascending=False).head(5)

# Create list of the top 5 players' names.
top_spenders = top_spenders.drop('Price', 1)
top_spenders.reset_index()
top_spender1 = top_spenders.iloc[0].name
top_spender2 = top_spenders.iloc[1].name
top_spender3 = top_spenders.iloc[2].name
top_spender4 = top_spenders.iloc[3].name
top_spender5 = top_spenders.iloc[4].name

top_spender_list = [top_spender1, top_spender2, top_spender3, top_spender4, top_spender5]

# Select all purchases made by the top spenders within the entire list of purchases and 
# save in a new data frame for later analysis.
top_spender_df = pd.DataFrame(df[df['SN'].isin(top_spender_list)])

# Group new data frame by player's name.
by_topSpender = top_spender_df.groupby(['SN'])

# Create new data frame for top spenders by purchase count, average purchase price, and total purchase value.
topSpender_df = pd.DataFrame({'Purchase Count' : by_topSpender['Item ID'].count(),
                             'Average Purchase Price' : by_topSpender['Price'].mean(),
                             'Total Purchase Value' : by_topSpender['Price'].sum()})

# Reorder columns according to instruction.
topSpender_df = topSpender_df[['Purchase Count', 'Average Purchase Price', 'Total Purchase Value']]

# Sort 'Total Purchase Value' by descending order.
topSpender_df.sort_values('Total Purchase Value', inplace=True, ascending=False)

# Format all pricing with '$' and two digit decimal places.
topSpender_df["Average Purchase Price"] = topSpender_df["Average Purchase Price"].map("${:,.2f}".format)
topSpender_df["Total Purchase Value"] = topSpender_df["Total Purchase Value"].map("${:,.2f}".format)

topSpender_df

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value
SN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Undirrala66,5,$3.41,$17.06
Saedue76,4,$3.39,$13.56
Mindimnya67,4,$3.18,$12.74
Haellysu29,3,$4.24,$12.73
Eoda93,3,$3.86,$11.58


In [10]:
#Most Popular Items

#Identify the 5 most popular items by purchase count, then list (in a table):
#  Item ID
#  Item Name
#  Purchase Count
#  Item Price
#  Total Purchase Value

# Group by item and purchase count.
pop_items = pd.DataFrame(df.groupby('Item ID')['Item Name'].count())

# Grab only the top 5.
top_items = pop_items.sort_values('Item Name', ascending=False).head(5)

# Create list of the top 5 items.
top_items = top_items.drop('Item Name', 1)
top_items.reset_index()
top_item1 = top_items.iloc[0].name
top_item2 = top_items.iloc[1].name
top_item3 = top_items.iloc[2].name
top_item4 = top_items.iloc[3].name
top_item5 = top_items.iloc[4].name

top_item_list = [top_item1, top_item2, top_item3, top_item4, top_item5]

# Select all purchases with the most popular items by purchase count within the entire list of purchases and 
# save in a new data frame for later analysis.
top_item_df = pd.DataFrame(df[df['Item ID'].isin(top_item_list)])

# Create list of the 5 most popular item names.
item1_name = top_item_df.loc[top_item_df['Item ID'] == top_item1,'Item Name'].iloc[0]
item2_name = top_item_df.loc[top_item_df['Item ID'] == top_item2,'Item Name'].iloc[0]
item3_name = top_item_df.loc[top_item_df['Item ID'] == top_item3,'Item Name'].iloc[0]
item4_name = top_item_df.loc[top_item_df['Item ID'] == top_item4,'Item Name'].iloc[0]
item5_name = top_item_df.loc[top_item_df['Item ID'] == top_item5,'Item Name'].iloc[0]

itemName_list = (item1_name, item2_name, item3_name, item4_name, item5_name)

# Group new data frame by the 5 most popular items.
by_topItem = top_item_df.groupby(['Item ID'])

# Create new data frame for the 5 most popular items by purchase count, average purchase price, and total purchase value.
# Called function 'OrderedDict' from collections package to preserve order in data frame. No need to reorder!
topItem_df = pd.DataFrame( OrderedDict({'Item Name' : itemName_list,
                           'Purchase Count' : by_topItem['Item ID'].count(),
                           'Item Price' : by_topItem['Price'].mean(),
                           'Total Purchase Value' : by_topItem['Price'].sum()
                          }))

# Sort 'Purchase Count' by descending order.
topItem_df.sort_values('Purchase Count', inplace=True, ascending=False)

# Format all pricing with '$' and two digit decimal places.
topItem_df["Item Price"] = topItem_df["Item Price"].map("${:,.2f}".format)
topItem_df["Total Purchase Value"] = topItem_df["Total Purchase Value"].map("${:,.2f}".format)

topItem_df

Unnamed: 0_level_0,Item Name,Purchase Count,Item Price,Total Purchase Value
Item ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
39,Trickster,11,$2.35,$25.85
84,Woeful Adamantite Claymore,11,$2.23,$24.53
13,"Betrayal, Whisper of Grieving Widows",9,$1.49,$13.41
31,Arcane Gem,9,$2.07,$18.63
175,Serenity,9,$1.24,$11.16


In [11]:
# Most Profitable Items

# Identify the 5 most profitable items by total purchase value, then list (in a table):
#   Item ID
#   Item Name
#   Purchase Count
#   Item Price

# Group by item and total purchase value.
total_purchase_value = pd.DataFrame(df.groupby('Item ID')['Price'].sum())

# Grab only the top 5 using nlargest function rather than sort descending then selecting the first 5 rows.
profitable = total_purchase_value.nlargest(5, 'Price')
profitable

# Create list of the top 5 total purchase value
top_profit = profitable.drop('Price', 1)
top_profit.reset_index()
top_profit1 = profitable.iloc[0].name
top_profit2 = profitable.iloc[1].name
top_profit3 = profitable.iloc[2].name
top_profit4 = profitable.iloc[3].name
top_profit5 = profitable.iloc[4].name

top_profit_list = [top_profit1, top_profit2, top_profit3, top_profit4, top_profit5]

# Select all the 5 most profitable items by total purchase value within the entire list of purchases and 
# save in a new data frame for later analysis.
top_profit_df = pd.DataFrame(df[df['Item ID'].isin(top_profit_list)])

# Create list of the 5 most popular item names.
profit1_name = top_profit_df.loc[top_profit_df['Item ID'] == top_profit1,'Item Name'].iloc[0]
profit2_name = top_profit_df.loc[top_profit_df['Item ID'] == top_profit2,'Item Name'].iloc[0]
profit3_name = top_profit_df.loc[top_profit_df['Item ID'] == top_profit3,'Item Name'].iloc[0]
profit4_name = top_profit_df.loc[top_profit_df['Item ID'] == top_profit4,'Item Name'].iloc[0]
profit5_name = top_profit_df.loc[top_profit_df['Item ID'] == top_profit5,'Item Name'].iloc[0]

profitName_list = [profit1_name, profit2_name, profit3_name, profit4_name, profit5_name]
print  (profitName_list)
by_topProfit = top_profit_df.groupby(['Item ID'])

# Create new data frame for the 5 most profitable items showing item id,item name, purchase count, and item price.
# Called function 'OrderedDict' from collections package to preserve order in data frame. No need to reorder!
topProfit_df = pd.DataFrame(OrderedDict({'Item Name' : [profit1_name, profit2_name, profit3_name, profit4_name, profit5_name],
                           'Purchase Count' : by_topProfit['Item ID'].count(),
                           'Total Purchase Value' : by_topProfit['Price'].sum(),
                           'Item Price' : by_topProfit['Price'].mean()
                          }))

# Sort 'Total Purchase Value' by descending order.
topProfit_df.sort_values('Total Purchase Value', inplace=True, ascending=False)

# Format all pricing with '$' and two digit decimal places.
topProfit_df['Total Purchase Value'] = topProfit_df['Total Purchase Value'].map("${:,.2f}".format)
topProfit_df['Item Price'] = topProfit_df['Item Price'].map("${:,.2f}".format)

topProfit_df

['Retribution Axe', 'Spectral Diamond Doomblade', 'Orenmir', 'Singed Scalpel', 'Splitter, Foe Of Subtlety']


Unnamed: 0_level_0,Item Name,Purchase Count,Total Purchase Value,Item Price
Item ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
34,Spectral Diamond Doomblade,9,$37.26,$4.14
115,"Splitter, Foe Of Subtlety",7,$29.75,$4.25
32,Retribution Axe,6,$29.70,$4.95
103,Orenmir,6,$29.22,$4.87
107,Singed Scalpel,8,$28.88,$3.61
