### Note
* Instructions have been included for each segment. You do not have to follow them exactly, but they are included to help you think through the steps.

In [32]:
# Dependencies and Setup
import pandas as pd

# File to Load (Remember to Change These)
file_to_load = "Resources/purchase_data.csv"

# Read Purchasing File and store into Pandas data frame
purchase_data_df = pd.read_csv(file_to_load)
# print((purchase_data_df.dtypes))  #Satisfactory

## Player Count

* Display the total number of players


In [33]:
purchase_data_df.head(20)
# purchase_data.describe()
# SN (screen name) will provide identifier for unique persons playing, and number of players.

# purchase_data_df["SN"].describe(), shows 576 unique
player_count = len(purchase_data_df["SN"].unique())
print(f" Total number of players = {player_count}")

 Total number of players = 576


## Purchasing Analysis (Total)

* Run basic calculations to obtain number of unique items, average price, etc.


* Create a summary data frame to hold the results


* Optional: give the displayed data cleaner formatting


* Display the summary data frame


In [34]:
#Quick analysis of data frame
purchase_data_df.head(20)

#Retrieve data, and map to format
unique_items = len(purchase_data_df["Item Name"].unique())

av_purch_price = purchase_data_df["Price"].mean()
# no_purch = (purchase_data_df["Purchase ID"]).tail() # shows 779 + 1st purchase = 780 purchases
no_purch = len(purchase_data_df["Purchase ID"])
rev = purchase_data_df["Price"].sum()

#Create new dataframe for summary data to be presented 
purch_analysis_df = pd.DataFrame({"Number of Unique Items": [unique_items],
                                  "Average Price": [av_purch_price],
                                  "Number of Purchases":[no_purch],
                                  "Total Revenue": [rev]})

#Format each column using dict mapping
purch_analysis_df["Average Price"] = purch_analysis_df["Average Price"].map("${:.2f}".format)
purch_analysis_df["Total Revenue"] = purch_analysis_df["Total Revenue"].map("${:.2f}".format)

purch_analysis_df

Unnamed: 0,Number of Unique Items,Average Price,Number of Purchases,Total Revenue
0,179,$3.05,780,$2379.77


## Gender Demographics

* Percentage and Count of Male Players


* Percentage and Count of Female Players


* Percentage and Count of Other / Non-Disclosed




In [35]:
# purchase_data_df.head()
# count_gender = purchase_data_df["Gender"].value_counts()
# count_gender
# gender_summary = pd.DataFrame({"G"})

#Need SN and Gender columns, to obtain list of unique players and their gender
reduced_df = purchase_data_df[["SN","Gender","Purchase ID"]]

#Replace purchase ID with the number 1, for sum'ing later
# reduced_df['Purhcase ID'] = reduced_df['Purchase ID'].replace([:],'1')
reduced_df_SN = reduced_df.value_counts(subset=['SN','Gender'])     #Retuns 576 unqiue players, duplicates subtracted
gender_group = reduced_df_SN.groupby(["Gender"])
gender_compare = gender_group.count()
# print(type(gender_compare))     #Output is series

#Convert count series output into dataframe, set headers titles, sort values ascending order
final_compare_df = gender_group.count().rename_axis('Gender').reset_index(name='Total Count').sort_values(['Total Count'], ascending=False)
# final_compare_df2 = final_compare_df[["Gender","Total Count"]]  #Don't know how to drop index
final_compare_df3 = final_compare_df.set_index("Gender")

#Percentage calcs
#Sum population & create variable for total, calc every row against sum and create new column, format fro %2f
total = final_compare_df3["Total Count"].sum()
Percent = final_compare_df3["Total Count"]/total
final_compare_df3["Percentage of Players"] = Percent
final_compare_df3

Unnamed: 0_level_0,Total Count,Percentage of Players
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,484,0.840278
Female,81,0.140625
Other / Non-Disclosed,11,0.019097



## Purchasing Analysis (Gender)

* Run basic calculations to obtain purchase count, avg. purchase price, avg. purchase total per person etc. by gender




* Create a summary data frame to hold the results


* Optional: give the displayed data cleaner formatting


* Display the summary data frame

In [82]:
#Method, (1) groupby on raw data, 3 times for 3 different functions (ie. count, mean, sum)
# (2) above outputs (series) converted to dfs individually, via .reset_index
# (3) merge dfs, final calc, format


#:::::: (1) & (2)::::::
# Retrieve Purchase count (by Gender)
gender_compare_count = gender_group.count()
gender_compare_count2 = gender_compare_count.reset_index()
gender_compare_count_rn = gender_compare_count2.rename(columns={"0":"Purchase Count"})
    # Reset index: to convert series to dataframe.
gender_compare_count_rn_reset = gender_compare_count_rn.reset_index()
final_gender_count = gender_compare_count_rn[["Gender","Purchase Count"]]

# Retrieve Ave purchase price (by Gender)
gender_compare_pricemean = gender_group.mean()
gender_compare_pricemean_rn = gender_compare_pricemean.rename(columns={"Price":"Average Purchase Price"})
# gender_compare_pricemean_rndol = gender_compare_pricemean_rn["Average Purchase Price"].map("${:.2f}".format)
    #Reset index: to convert series to datagrame.
final_avpurchprice = gender_compare_pricemean_rn.reset_index()

# Retrieve Total Purchase Value (by Gender)
gender_compare_totalpurch = gender_group.sum().rename(columns={"Price":"Total Purchase Value"})
    #Reset index: to convert series to datagrame.
final_totalrev = gender_compare_totalpurch.reset_index()

## Summary of info in dfs so far
# final_gender_count              # Gender, Purchase Count dataframe:
# final_avpurchprice              # Average Purchase Price
# final_totalrev                  # Total rev


#::::::::(3):::::::::
#This returns incorrect dataframes, don't understand why
# final_summary = pd.DataFrame({"Gender": [final_gender_count],
#                                             "Purchase Count": [final_gender_count],
#                                             "Average Purhcase Price": [final_avpurchprice],
#                                             "Total Purchase Value": [final_totalrev]})

#Merge first 2 of 3 dfs
merge_df = pd.merge(final_gender_count, final_avpurchprice, on="Gender", how="outer")

#Merge new df with 3rd initial df
final_merge_df = pd.merge(merge_df, final_totalrev, on="Gender", how="outer")
final_merge_df

#Calc av p/person
avgpp = final_merge_df["Total Purchase Value"]/final_merge_df["Purchase Count"]
final_merge_df["Avg Total Purchase per Person"] = avgpp
final_merge_df

# #Format each column using dict mapping
final_merge_df["Average Purchase Price"] = final_merge_df["Average Purchase Price"].map("${:.2f}".format)
final_merge_df["Total Purchase Value"] = final_merge_df["Total Purchase Value"].map("${:.2f}".format)
final_merge_df["Avg Total Purchase per Person"] = final_merge_df["Avg Total Purchase per Person"].map("${:.2f}".format)
final_merge_df


TypeError: rename() got an unexpected keyword argument 'columns'

## Age Demographics

* Establish bins for ages


* Categorize the existing players using the age bins. Hint: use pd.cut()


* Calculate the numbers and percentages by age group


* Create a summary data frame to hold the results


* Optional: round the percentage column to two decimal points


* Display Age Demographics Table


In [None]:
# purchase_data_df["Age"].describe(), although duplicates exist, max 45 min 7
# # Creating bins
bins = [0, 9.9, 13.99, 19.9, 24.9, 29.9, 34.9, 39.9, 49.9]

# Create labels for the bins
group_labels = ["<10", "10-14", "15-19","20-24","25-29","30-34","35-39", "40+"]


reduced_df = purchase_data_df[["SN","Age"]]
reduced_df

# reduced_df.reset_index()

# Drop duplicate SNs
dup_out = reduced_df.drop_duplicates(subset=['SN','Age'], keep="first")   #Returns 576, corresponds to prior calcs, correct
dup_out.dtypes

#Categorise players
dup_out["Ages"] = pd.cut(dup_out["Age"], bins, labels= group_labels)
dup_out.head()

#Create a GroupBy object based upon "Age"
dup_out_group = dup_out.groupby("Ages")

# Count how many fall within each bin by using count and print series
dup_out_group["SN"].count()

# Convert series back into df using reset_index
age_demo_df = dup_out_group["SN"].count().reset_index()
age_demo_df

# Calc % of players and format .00%
age_demo_df["Percentage of Players"] = age_demo_df["SN"] / age_demo_df["SN"].sum()
age_demo_df["Percentage of Players"] = age_demo_df["Percentage of Players"].astype(float).map("{:.2%}".format)

# Rename SN to Total Count
age_demo_df = age_demo_df.rename(columns={"SN": "Total Count"})
final_age_demo_df = age_demo_df.set_index("Ages")
final_age_demo_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,Total Count,Percentage of Players
Ages,Unnamed: 1_level_1,Unnamed: 2_level_1
<10,17,2.95%
10-14,20,3.47%
15-19,109,18.92%
20-24,258,44.79%
25-29,77,13.37%
30-34,52,9.03%
35-39,31,5.38%
40+,12,2.08%


## Purchasing Analysis (Age)

* Bin the purchase_data data frame by age


* Run basic calculations to obtain purchase count, avg. purchase price, avg. purchase total per person etc. in the table below


* Create a summary data frame to hold the results


* Optional: give the displayed data cleaner formatting


* Display the summary data frame

In [62]:
# Slice data and bin by "Age", and place the data series into a new column of DF
purchase_data_df["Age Ranges"] = pd.cut(purchase_data_df["Age"], bins, labels=group_labels)
purchase_data_df

# GroupBy
purch_age_group = purchase_data_df.groupby("Age Ranges")

# Find purchase count per Age range (count 'Purchase ID')
pc_age_group = purch_age_group["Purchase ID"].count().reset_index()
# print(purch_age_group["Purchase ID"].count().reset_index())

# Find av purchase price per Age range (mean 'Price')
avp_age_group = purch_age_group["Price"].mean().reset_index()
# print(purch_age_group["Price"].mean().reset_index())

#Find purch total per Age range (sum 'Price) and convert to DF (via .reset_index)
pt_age_group = purch_age_group["Price"].sum().reset_index()
# print(purch_age_group["Price"].sum().reset_index())

# Create Summary data frame to hold results
final_summary = pd.merge(pc_age_group, avp_age_group, on="Age Ranges")
final_summary2 = pd.merge(final_summary, pt_age_group, on="Age Ranges")
final_summary2

# Rename columns
final_summary2 = final_summary2.rename(columns={"Purchase ID": "Purchase Count", 
                                                "Price_x": "Average Purchase Price", 
                                                "Price_y": "Total Purchase Value"})
final_summary2

# Calc Av Total Purch p/person
# final_summary2["Avg Total Purchase per Person"] = final_summary2["Total Purchase Value"] / final_summary2["Purchase Count"]
# final_summary2

# Format
final_summary2["Average Purchase Price"] = final_summary2["Average Purchase Price"].astype(float).map("${:,.2f}".format)
final_summary2["Total Purchase Value"] = final_summary2["Total Purchase Value"].astype(float).map("${:,.2f}".format)
# final_summary2["Avg Total Purchase per Person"] = final_summary2["Avg Total Purchase per Person"].astype(float).map("${:,.2f}".format)
final_summary2

final_summary4 = final_summary2.set_index("Age Ranges")
final_summary4

# Set index to 'Age Ranges'
# final_summary3 = final_summary2.set_index("Age Ranges")
# print(final_summary3)


# # Count person per "Age range' from dropped duplicate data (dup_out)

# ### Redoing code for dup_out, and changing name, to avoid code breaks #################
# bins = [0, 9.9, 13.99, 19.9, 24.9, 29.9, 34.9, 39.9, 49.9]
# # Create labels for the bins
# group_labels = ["<10", "10-14", "15-19","20-24","25-29","30-34","35-39", "40+"]

# reduced_df = purchase_data_df[["SN","Age"]]

# # Drop duplicate SNs
# dup_out2 = reduced_df.drop_duplicates(subset=['SN','Age'], keep="first")   #Returns 576, corresponds to prior calcs, correct
# dup_out2.dtypes

# #Categorise players
# dup_out2["Ages"] = pd.cut(dup_out2["Age"], bins, labels= group_labels)
# dup_out2.head()
# ########################################################################################

# # # Find purchase count per Age range (count 'Purchase ID')###
# ranges_valuecounts = dup_out2["Ages"].value_counts()
# ranges_valuecounts2 = ranges_valuecounts.reset_index()
# # ranges_valuecounts.set_index("index")
# # print(final_summary3)
# ranges_valuecounts2
# # final_summary2["Avg Total Purchase per Person"] = final_summary2["Total Purchase Value"] / ranges_valuecounts2["Ages"]

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value
Age Ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
<10,23,$3.35,$77.13
10-14,26,$2.92,$75.87
15-19,138,$3.04,$419.80
20-24,365,$3.05,"$1,114.06"
25-29,101,$2.90,$293.00
30-34,73,$2.93,$214.00
35-39,41,$3.60,$147.67
40+,13,$2.94,$38.24


## Top Spenders

* Run basic calculations to obtain the results in the table below


* Create a summary data frame to hold the results


* Sort the total purchase value column in descending order


* Optional: give the displayed data cleaner formatting


* Display a preview of the summary data frame



In [106]:
# Read Purchasing File and store into new Pandas data frame
purchase_data_df2 = pd.read_csv(file_to_load)

reduced_top_purchase_data_df = purchase_data_df2.loc[:, ["SN","Price","Purchase ID"]]
print(reduced_top_purchase_data_df)

# GROUPBY SN 
## For Purchase Count
SN_group_PC = reduced_top_purchase_data_df.groupby(["SN"])

# Retrieve 'Purchase Count'
SN_group_PC2 = SN_group_PC.count()
SN_group_PC_renamed = SN_group_PC2.rename(columns={"Price": "Purchase Count"})          # Rename column
SN_group_PC_sort = SN_group_PC_renamed.sort_values("Purchase Count", ascending=False)   # Sort accordingly
final_SN_group_PC = SN_group_PC_sort[["Purchase Count"]]                                # Keep only relevant column
final_SN_group_PC

# Retrieve 'Average Purchase Price'
# SN_group_APP = SN_group.mean()
# SN_group_APP


# # Retrieve 'Total Purchase Value'
# SN_group_TPV = SN_group.sum()
# print(SN_group_TPV)




# #Convert count series output into dataframe, set headers titles, sort values ascending order
# final_compare_df = gender_group.count().rename_axis('Gender').reset_index(name='Total Count').sort_values(['Total Count'], ascending=False)
# # final_compare_df2 = final_compare_df[["Gender","Total Count"]]  #Don't know how to drop index
# final_compare_df3 = final_compare_df.set_index("Gender")






# test = SN_group2["Purhcase ID"].count().reset_index()
# print(test)
# Count how many fall within each bin by using count and print series


                SN  Price  Purchase ID
0          Lisim78   3.53            0
1      Lisovynya38   1.56            1
2       Ithergue48   4.88            2
3    Chamassasya86   3.27            3
4        Iskosia90   1.44            4
..             ...    ...          ...
775     Aethedru70   3.54          775
776         Iral74   1.63          776
777     Yathecal72   3.46          777
778        Sisur91   4.19          778
779      Ennrian78   4.60          779

[780 rows x 3 columns]


Unnamed: 0_level_0,Purchase Count
SN,Unnamed: 1_level_1
Lisosia93,5
Iral74,4
Idastidru52,4
Asur53,3
Inguron55,3
...,...
Hala31,1
Haisurra41,1
Hailaphos89,1
Haestyphos66,1


## Most Popular Items

* Retrieve the Item ID, Item Name, and Item Price columns


* Group by Item ID and Item Name. Perform calculations to obtain purchase count, item price, and total purchase value


* Create a summary data frame to hold the results


* Sort the purchase count column in descending order


* Optional: give the displayed data cleaner formatting


* Display a preview of the summary data frame



Unnamed: 0_level_0,Unnamed: 1_level_0,Purchase Count,Item Price,Total Purchase Value
Item ID,Item Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
92,Final Critic,13,$4.61,$59.99
178,"Oathbreaker, Last Hope of the Breaking Storm",12,$4.23,$50.76
145,Fiery Glass Crusader,9,$4.58,$41.22
132,Persuasion,9,$3.22,$28.99
108,"Extraction, Quickblade Of Trembling Hands",9,$3.53,$31.77


## Most Profitable Items

* Sort the above table by total purchase value in descending order


* Optional: give the displayed data cleaner formatting


* Display a preview of the data frame



Unnamed: 0_level_0,Unnamed: 1_level_0,Purchase Count,Item Price,Total Purchase Value
Item ID,Item Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
92,Final Critic,13,$4.61,$59.99
178,"Oathbreaker, Last Hope of the Breaking Storm",12,$4.23,$50.76
82,Nirvana,9,$4.90,$44.10
145,Fiery Glass Crusader,9,$4.58,$41.22
103,Singed Scalpel,8,$4.35,$34.80
