<a href="https://colab.research.google.com/github/ezgior/Persona/blob/main/segmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Customer Segmentation



In [1]:
import numpy as np
import pandas as pd


In [15]:
def check_df(dataframe, head=5):
    """Print a quick structural overview of a DataFrame.

    The sections are printed in this order: shape, column dtypes, first rows,
    last rows, missing-value count per column, and selected quantiles of the
    numeric columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise.
    head : int, default 5
        Number of rows shown in both the head and the tail preview.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(head))
    print("##################### Tail #####################")
    print(dataframe.tail(head))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    # Quantiles are only defined for numeric data. Calling quantile() on a frame
    # that still holds object columns raises TypeError on recent pandas versions,
    # so restrict the computation to the numeric columns explicitly.
    numeric_columns = dataframe.select_dtypes(include="number")
    print(numeric_columns.quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T)


In [3]:
# Load the raw sales records from the relative path sample_data/persona.csv.
df = pd.read_csv(filepath_or_buffer='sample_data/persona.csv')


In [16]:
# Print the structural overview (shape, dtypes, head/tail, NA counts, quantiles)
# of the loaded frame using the default preview length.
check_df(dataframe=df)

##################### Shape #####################
(5000, 5)
##################### Types #####################
PRICE       int64
SOURCE     object
SEX        object
COUNTRY    object
AGE         int64
dtype: object
##################### Head #####################
   PRICE   SOURCE   SEX COUNTRY  AGE
0     39  android  male     bra   17
1     39  android  male     bra   17
2     49  android  male     bra   17
3     29  android  male     tur   17
4     49  android  male     tur   17
##################### Tail #####################
      PRICE   SOURCE     SEX COUNTRY  AGE
4995     29  android  female     bra   31
4996     29  android  female     bra   31
4997     29  android  female     bra   31
4998     39  android  female     bra   31
4999     29  android  female     bra   31
##################### NA #####################
PRICE      0
SOURCE     0
SEX        0
COUNTRY    0
AGE        0
dtype: int64
##################### Quantiles #####################
       0.00  0.05  0.50  0.95  0.99

In [4]:
# Preview the first five rows of the raw data.
df.head(n=5)


Unnamed: 0,PRICE,SOURCE,SEX,COUNTRY,AGE
0,39,android,male,bra,17
1,39,android,male,bra,17
2,49,android,male,bra,17
3,29,android,male,tur,17
4,49,android,male,tur,17


In [7]:
# Dimensions of the raw frame, displayed as a (rows, columns) tuple.
row_count, column_count = df.shape
(row_count, column_count)


(5000, 5)

In [9]:
# Summary statistics of the numeric columns, one row per column.
df.describe().transpose()


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PRICE,5000.0,34.132,12.464897,9.0,29.0,39.0,39.0,59.0
AGE,5000.0,23.5814,8.995908,15.0,17.0,21.0,27.0,66.0


In [10]:
# Preview the frame with the row index materialised as an explicit "index" column.
# The result is only displayed; `df` itself is left unchanged because the return
# value is not assigned.
df.reset_index()


Unnamed: 0,index,PRICE,SOURCE,SEX,COUNTRY,AGE
0,0,39,android,male,bra,17
1,1,39,android,male,bra,17
2,2,49,android,male,bra,17
3,3,29,android,male,tur,17
4,4,49,android,male,tur,17
...,...,...,...,...,...,...
4995,4995,29,android,female,bra,31
4996,4996,29,android,female,bra,31
4997,4997,29,android,female,bra,31
4998,4998,39,android,female,bra,31


In [11]:

# Count the missing values in each column.
df.isna().sum()

PRICE      0
SOURCE     0
SEX        0
COUNTRY    0
AGE        0
dtype: int64

In [14]:
# Average PRICE for every (COUNTRY, SOURCE, SEX, AGE) combination,
# ordered from the most to the least expensive combination.
agg_df = (
    df.groupby(["COUNTRY", "SOURCE", "SEX", "AGE"])
    .mean()
    .sort_values(by="PRICE", ascending=False)
)
agg_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,PRICE
COUNTRY,SOURCE,SEX,AGE,Unnamed: 4_level_1
bra,android,male,46,59.0
usa,android,male,36,59.0
fra,android,female,24,59.0
usa,ios,male,32,54.0
deu,android,female,36,49.0


In [17]:

# Turn the four group keys from index levels back into ordinary columns.
# Note: this rebinds `agg_df`, so run the cell exactly once per kernel session.
agg_df = agg_df.reset_index(drop=False)
agg_df.head()

Unnamed: 0,COUNTRY,SOURCE,SEX,AGE,PRICE
0,bra,android,male,46,59.0
1,usa,android,male,36,59.0
2,fra,android,female,24,59.0
3,usa,ios,male,32,54.0
4,deu,android,female,36,49.0


In [18]:
# Labels for the age brackets; the last bracket is named after the oldest age in the data.
mylabels = ['0_18', '19_23', '24_30', '31_40']
mylabels.append('41_' + str(agg_df["AGE"].max()))

In [20]:
# Bracket edges matching `mylabels`; the upper edge of the last bracket is the oldest age.
bins = [0, 18, 23, 30, 40] + [agg_df["AGE"].max()]
# Assign every row to its age bracket.
agg_df["age_cat"] = pd.cut(x=agg_df["AGE"], bins=bins, labels=mylabels)
agg_df.head()

Unnamed: 0,COUNTRY,SOURCE,SEX,AGE,PRICE,age_cat
0,bra,android,male,46,59.0,41_66
1,usa,android,male,36,59.0,31_40
2,fra,android,female,24,59.0,24_30
3,usa,ios,male,32,54.0,31_40
4,deu,android,female,36,49.0,31_40


In [23]:
# Build the persona label COUNTRY_SOURCE_SEX_AGECAT in upper case.
# Columns are addressed by name rather than by position in `agg_df.values`,
# so the label does not depend on the column order of the frame.
agg_df["customers_level_based"] = (
    agg_df["COUNTRY"]
    .str.cat(
        [agg_df["SOURCE"], agg_df["SEX"], agg_df["age_cat"].astype(str)],
        sep="_",
    )
    .str.upper()
)
agg_df.head()

Unnamed: 0,COUNTRY,SOURCE,SEX,AGE,PRICE,age_cat,customers_level_based
0,bra,android,male,46,59.0,41_66,BRA_ANDROID_MALE_41_66
1,usa,android,male,36,59.0,31_40,USA_ANDROID_MALE_31_40
2,fra,android,female,24,59.0,24_30,FRA_ANDROID_FEMALE_24_30
3,usa,ios,male,32,54.0,31_40,USA_IOS_MALE_31_40
4,deu,android,female,36,49.0,31_40,DEU_ANDROID_FEMALE_31_40


Now we have the combined information of every customer in a new column.

In [24]:
# Keep only the persona label and its price; the other columns are no longer needed.
agg_df = agg_df.loc[:, ["customers_level_based", "PRICE"]]
agg_df.head()

Unnamed: 0,customers_level_based,PRICE
0,BRA_ANDROID_MALE_41_66,59.0
1,USA_ANDROID_MALE_31_40,59.0
2,FRA_ANDROID_FEMALE_24_30,59.0
3,USA_IOS_MALE_31_40,54.0
4,DEU_ANDROID_FEMALE_31_40,49.0


Check whether the same persona appears in more than one row:


In [25]:
# How many rows share each persona label (a count above 1 means duplicates).
persona_labels = agg_df["customers_level_based"]
persona_labels.value_counts()

BRA_ANDROID_FEMALE_24_30    7
USA_IOS_FEMALE_24_30        7
USA_ANDROID_MALE_24_30      7
BRA_ANDROID_MALE_24_30      7
USA_ANDROID_MALE_41_66      7
                           ..
CAN_ANDROID_MALE_41_66      1
TUR_IOS_FEMALE_41_66        1
CAN_ANDROID_FEMALE_41_66    1
FRA_ANDROID_MALE_24_30      1
FRA_ANDROID_FEMALE_31_40    1
Name: customers_level_based, Length: 109, dtype: int64

To combine the duplicated personas into a single row each, we can take the mean of their prices.

In [28]:
# Collapse duplicated persona labels into one row each by averaging their PRICE.
# NOTE(review): the rows being averaged are already per-age mean prices, so this is an
# unweighted mean of means, not the mean over the individual sales. Confirm this is intended.
agg_df = agg_df.groupby("customers_level_based")[["PRICE"]].mean()
agg_df.head()

Unnamed: 0_level_0,PRICE
customers_level_based,Unnamed: 1_level_1
BRA_ANDROID_FEMALE_0_18,35.645303
BRA_ANDROID_FEMALE_19_23,34.07734
BRA_ANDROID_FEMALE_24_30,33.863946
BRA_ANDROID_FEMALE_31_40,34.898326
BRA_ANDROID_FEMALE_41_66,36.737179


In [29]:
# Move the persona label from the index back into a regular column.
# Note: this rebinds `agg_df`, so run the cell exactly once per kernel session.
agg_df = agg_df.reset_index(drop=False)
agg_df.head()

Unnamed: 0,customers_level_based,PRICE
0,BRA_ANDROID_FEMALE_0_18,35.645303
1,BRA_ANDROID_FEMALE_19_23,34.07734
2,BRA_ANDROID_FEMALE_24_30,33.863946
3,BRA_ANDROID_FEMALE_31_40,34.898326
4,BRA_ANDROID_FEMALE_41_66,36.737179


In [30]:
# Split the personas into four equally sized groups by mean price.
# Labels are assigned in ascending price order: "D" is the cheapest quartile, "A" the most expensive.
agg_df["SEGMENT"] = pd.qcut(agg_df["PRICE"], 4, labels=["D", "C", "B", "A"])
# Mean price per segment. `observed` is passed explicitly because SEGMENT is categorical
# and recent pandas versions warn when the default is relied upon.
agg_df.groupby("SEGMENT", observed=False).agg({"PRICE": "mean"})

Unnamed: 0_level_0,PRICE
SEGMENT,Unnamed: 1_level_1
D,29.20678
C,33.509674
B,34.999645
A,38.691234


Now we can look up the segment of a new customer using this dataframe.



In [32]:
# Persona label of the customer whose segment we want to look up.
new_user = "FRA_IOS_FEMALE_24_30"
# Select the row(s) whose persona label matches the new customer.
is_new_user = agg_df["customers_level_based"] == new_user
agg_df.loc[is_new_user]

Unnamed: 0,customers_level_based,PRICE,SEGMENT
62,FRA_IOS_FEMALE_24_30,27.714286,D
