# LEVEL BASED CUSTOMER SEGMENTATION

In [21]:
import pandas as pd

1. Read the users and purchases data sets and merge them with an inner join on the "uid" variable.
2. How many unique customers are there?
3. How many unique prices are there?
4. How many sales were made at each price?
5. How many sales were made in each country?
6. What is the total revenue from sales in each country?
7. How many sales were made per device type?
8. What is the average price by country?
9. What is the average price by device?
10. What is the average price for each country-device combination?

In [22]:
# Load both CSV files, then inner-join purchases with users on the shared "uid" column.
users = pd.read_csv("users.csv")
purchases = pd.read_csv("purchases.csv")
df = pd.merge(purchases, users, how="inner", on="uid")

In [23]:
def check_level(dataframe):
    """Print a fixed set of summary statistics for a sales DataFrame.

    Every answer is printed right after its question and is followed by a
    separator line made of "#" characters.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide the columns "uid", "price", "country" and "device".

    Returns
    -------
    None
        The function only writes to standard output.
    """
    separator = "########################################################"

    # Each entry pairs a question with a zero-argument callable. The answer is
    # computed only when its turn comes, so earlier answers are still printed
    # if a later computation fails.
    reports = [
        ("How many unique clients are there?\n",
         lambda: dataframe["uid"].nunique()),
        ("How many unique pricing are there?\n",
         lambda: dataframe["price"].nunique()),
        ("How many pieces were sold at what price?\n",
         lambda: dataframe["price"].value_counts()),
        ("How many sales have been made in which country?\n",
         lambda: dataframe["country"].value_counts()),
        ("How much is the total earned from sales by country?\n",
         lambda: dataframe.groupby(["country"]).agg({"price": "sum"})),
        ("What are the sales numbers according to the device types?\n",
         lambda: dataframe.groupby(["device"]).agg({"uid": "count"})),
        ("What are the price averages by country?\n",
         lambda: dataframe.groupby(["country"]).agg({"price": "mean"})),
        ("What are the price averages according to the devices?\n",
         lambda: dataframe.groupby(["device"]).agg({"price": "mean"})),
        ("What are the price averages according to Country-Device grouping?\n",
         lambda: dataframe.groupby(by=["country", "device"]).agg({"price": "mean"})),
    ]

    for question, compute in reports:
        print(question, compute())
        print(separator)

In [24]:
# Print the summary statistics for the merged purchases/users frame.
check_level(df)

How many unique clients are there?
 1322
########################################################
How many unique pricing are there?
 6
########################################################
How many pieces were sold at what price?
 299    2347
499    2242
599    1848
199    1840
899     372
99      357
Name: price, dtype: int64
########################################################
How many sales have been made in which country?
 USA    3650
BRA    2694
DEU     915
TUR     804
FRA     544
CAN     399
Name: country, dtype: int64
########################################################
How much is the total earned from sales by country?
            price
country         
BRA      1104106
CAN       158901
DEU       374285
FRA       218556
TUR       333996
USA      1473550
########################################################
What are the sales numbers according to the device types?
          uid
device      
and     5345
iOS     3661
###############################################

In [25]:
# Total price for every (country, device, gender, age) combination, largest total first
agg_df = (
    df.groupby(["country", "device", "gender", "age"])[["price"]]
    .sum()
    .sort_values("price", ascending=False)
)
agg_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,price
country,device,gender,age,Unnamed: 4_level_1
USA,and,M,15,61550
BRA,and,M,19,45392
DEU,iOS,F,16,41602
USA,and,F,17,40004
USA,and,M,23,39802


In [26]:
# Move the index levels back into ordinary columns and renumber the rows from 0
agg_df = agg_df.reset_index()
agg_df.head()

Unnamed: 0,country,device,gender,age,price
0,USA,and,M,15,61550
1,BRA,and,M,19,45392
2,DEU,iOS,F,16,41602
3,USA,and,F,17,40004
4,USA,and,M,23,39802


In [27]:
# Bucket the numeric "age" column into four labelled ranges and store them as "age_cat".
# The last bin edge and the last label both come from the largest age in the data.
agg_df["age_cat"] = pd.cut(
    x=agg_df["age"],
    bins=[0, 18, 23, 40, agg_df["age"].max()],
    labels=["0_18", "19_23", "24_40", "41_" + str(agg_df["age"].max())],
)
agg_df.head()

Unnamed: 0,country,device,gender,age,price,age_cat
0,USA,and,M,15,61550,0_18
1,BRA,and,M,19,45392,19_23
2,DEU,iOS,F,16,41602,0_18
3,USA,and,F,17,40004,0_18
4,USA,and,M,23,39802,19_23


In [28]:
# Build the level-based segment key for every row and add it as a new column.
# Fields are taken by position from each row of agg_df.values (positions 0, 1, 2 and 5),
# upper-cased and joined with "_".
agg_df["customers_level_based"] = [
    "_".join(field.upper() for field in (row[0], row[1], row[2], row[5]))
    for row in agg_df.values
]
agg_df.head()

Unnamed: 0,country,device,gender,age,price,age_cat,customers_level_based
0,USA,and,M,15,61550,0_18,USA_AND_M_0_18
1,BRA,and,M,19,45392,19_23,BRA_AND_M_19_23
2,DEU,iOS,F,16,41602,0_18,DEU_IOS_F_0_18
3,USA,and,F,17,40004,0_18,USA_AND_F_0_18
4,USA,and,M,23,39802,19_23,USA_AND_M_19_23


In [29]:
# Select "customers_level_based" and "price"
select_col = ["customers_level_based", "price"]
# .copy() makes the result an independent frame, so adding a column to it later
# does not trigger pandas' SettingWithCopyWarning.
agg_df = agg_df[select_col].copy()
agg_df.head()

Unnamed: 0,customers_level_based,price
0,USA_AND_M_0_18,61550
1,BRA_AND_M_19_23,45392
2,DEU_IOS_F_0_18,41602
3,USA_AND_F_0_18,40004
4,USA_AND_M_19_23,39802


In [30]:
# Split the rows into four quantile-based groups on "price":
# the lowest quartile is labelled "D", the highest "A".
agg_df["segment"] = pd.qcut(x=agg_df["price"], q=4, labels=["D", "C", "B", "A"])
agg_df.head()

Unnamed: 0,customers_level_based,price,segment
0,USA_AND_M_0_18,61550,A
1,BRA_AND_M_19_23,45392,A
2,DEU_IOS_F_0_18,41602,A
3,USA_AND_F_0_18,40004,A
4,USA_AND_M_19_23,39802,A


In [31]:
# Mean "price" within each segment
print("Average price of each segmentation: \n", agg_df.groupby(["segment"])[["price"]].mean())

Average price of each segmentation: 
                 price
segment              
D         1335.096491
C         3675.504505
B         7447.812500
A        20080.150442


In [32]:
# Look up the row(s) for a new user's level-based key.
# The filter uses the variable, so changing new_user changes the lookup.
new_user = "TUR_IOS_F_41_75"
agg_df[agg_df["customers_level_based"] == new_user]

Unnamed: 0,customers_level_based,price,segment
377,TUR_IOS_F_41_75,1596,D


In [33]:
# Create Level Based Persona Function
def level_based_persona(dataframe):
    """Build the level-based customer segment table from a sales DataFrame.

    Steps:
      1. Sum "price" per (country, device, gender, age) group, sorted by the
         total in descending order.
      2. Bucket "age" into the ranges 0_18, 19_23, 24_40 and 41_<max age>.
      3. Build the key COUNTRY_DEVICE_GENDER_AGECAT (upper-cased) for each row.
      4. Label each row with a price quartile, "D" (lowest) to "A" (highest).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide the columns "country", "device", "gender", "age" and
        "price". The function reads this argument, not a global frame.

    Returns
    -------
    pandas.DataFrame
        Columns "customers_level_based", "price" and "segment", one row per
        (country, device, gender, age) group.

    Raises
    ------
    ValueError
        Raised by ``pd.cut`` when the largest age is not above 40 (bin edges
        must increase), or by ``pd.qcut`` when the quartile edges are not unique.
    """
    agg_df = (
        dataframe.groupby(by=["country", "device", "gender", "age"])
        .agg({"price": "sum"})
        .sort_values(by="price", ascending=False)
        .reset_index()
    )

    max_age = agg_df["age"].max()
    agg_df["age_cat"] = pd.cut(
        agg_df["age"],
        [0, 18, 23, 40, max_age],
        labels=["0_18", "19_23", "24_40", "41_" + str(max_age)],
    )

    # Address the fields by column name so the key does not depend on the
    # physical column order of the frame.
    agg_df["customers_level_based"] = [
        "_".join(part.upper() for part in parts)
        for parts in zip(agg_df["country"], agg_df["device"],
                         agg_df["gender"], agg_df["age_cat"])
    ]

    # .copy() makes the two-column selection an independent frame, so the
    # assignment below does not trigger SettingWithCopyWarning.
    select_col = ["customers_level_based", "price"]
    agg_df = agg_df[select_col].copy()
    agg_df["segment"] = pd.qcut(agg_df["price"], 4, labels=["D", "C", "B", "A"])
    return agg_df

In [34]:
# Build the segment table through the function, then show the row(s) for one persona key
final = level_based_persona(df)
final.loc[final["customers_level_based"] == "TUR_IOS_F_41_75"]

Unnamed: 0,customers_level_based,price,segment
377,TUR_IOS_F_41_75,1596,D
