<a href="https://colab.research.google.com/github/feyzayavuzFY/Rule_Based_Classification/blob/main/Rule_Based_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [59]:
#required libraries
import numpy as np
import pandas as pd

In [65]:
# Read the raw persona data and keep it untouched in `df_`;
# the analysis works on an independent copy, `df`.
df_ = pd.read_csv("persona.csv")
df = df_.copy(deep=True)

# General information about the dataset

In [47]:
# Peek at the first rows to see the columns and the kind of values they hold.
df.head()

Unnamed: 0,PRICE,SOURCE,SEX,COUNTRY,AGE
0,39,android,male,bra,17
1,39,android,male,bra,17
2,49,android,male,bra,17
3,29,android,male,tur,17
4,49,android,male,tur,17


In [48]:
# Summary statistics, transposed so that each described column is one row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PRICE,5000.0,34.132,12.464897,9.0,29.0,39.0,39.0,59.0
AGE,5000.0,23.5814,8.995908,15.0,17.0,21.0,27.0,66.0


In [49]:
# Column dtypes, non-null counts and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   PRICE    5000 non-null   int64 
 1   SOURCE   5000 non-null   object
 2   SEX      5000 non-null   object
 3   COUNTRY  5000 non-null   object
 4   AGE      5000 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 234.4+ KB


In [50]:
# Quick health check of the raw data: missing values, shape, dtypes and
# selected quantiles of the numeric columns.
print(df.isnull().sum()) # null values
print("#"*20)
print(df.shape)
print("#"*20)
print(df.dtypes)
print("#"*20)
# numeric_only=True is required here: SOURCE, SEX and COUNTRY hold strings, and
# since pandas 2.0 quantile() no longer drops non-numeric columns silently but
# raises a TypeError instead. Older pandas versions give the same result as before.
print(df.quantile([0, 0.05, 0.50, 0.95, 0.99, 1], numeric_only=True).T)

PRICE      0
SOURCE     0
SEX        0
COUNTRY    0
AGE        0
dtype: int64
####################
(5000, 5)
####################
PRICE       int64
SOURCE     object
SEX        object
COUNTRY    object
AGE         int64
dtype: object
####################
       0.00  0.05  0.50  0.95  0.99  1.00
PRICE   9.0  19.0  39.0  49.0  59.0  59.0
AGE    15.0  15.0  21.0  43.0  53.0  66.0


In [51]:
# Number of distinct values in SOURCE and in PRICE, one count per line.
print(df["SOURCE"].nunique(), df["PRICE"].nunique(), sep="\n")

2
6


In [52]:
# Frequency table of every value in SEX, SOURCE and PRICE, in that order.
for column in ("SEX", "SOURCE", "PRICE"):
    print(df[column].value_counts())

female    2621
male      2379
Name: SEX, dtype: int64
android    2974
ios        2026
Name: SOURCE, dtype: int64
29    1305
39    1260
49    1031
19     992
59     212
9      200
Name: PRICE, dtype: int64


In [53]:
# How many rows does each country account for? Most frequent country first.
print(df["COUNTRY"].value_counts(sort=True, ascending=False))

usa    2065
bra    1496
deu     455
tur     451
fra     303
can     230
Name: COUNTRY, dtype: int64


In [54]:
# Number of sales per country, largest first.
(
    df.groupby("COUNTRY")[["PRICE"]]
    .count()
    .sort_values(by="PRICE", ascending=False)
)

Unnamed: 0_level_0,PRICE
COUNTRY,Unnamed: 1_level_1
usa,2065
bra,1496
deu,455
tur,451
fra,303
can,230


In [55]:
# Total PRICE collected per country, largest first.
(
    df.groupby("COUNTRY")[["PRICE"]]
    .sum()
    .sort_values(by="PRICE", ascending=False)
)

Unnamed: 0_level_0,PRICE
COUNTRY,Unnamed: 1_level_1
usa,70225
bra,51354
tur,15689
deu,15485
fra,10177
can,7730


In [66]:
# Mean PRICE for every COUNTRY / SOURCE / SEX / AGE combination,
# kept in `new_df` with the highest averages first.
new_df = (
    df.groupby(["COUNTRY", "SOURCE", "SEX", "AGE"])[["PRICE"]]
    .mean()
    .sort_values(by="PRICE", ascending=False)
)
new_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,PRICE
COUNTRY,SOURCE,SEX,AGE,Unnamed: 4_level_1
bra,android,male,46,59.0
usa,android,male,36,59.0
fra,android,female,24,59.0
usa,ios,male,32,54.0
deu,android,female,36,49.0
...,...,...,...,...
usa,ios,female,38,19.0
usa,ios,female,30,19.0
can,android,female,27,19.0
fra,android,male,18,19.0


In [67]:
# Turn the COUNTRY / SOURCE / SEX / AGE index levels into ordinary columns.
# Rebinding (instead of inplace=True) avoids mutating the frame behind the
# reader's back, and the MultiIndex guard makes the cell safe to re-run:
# a second reset_index() on the already-flat frame would add a meaningless
# "index" column.
if isinstance(new_df.index, pd.MultiIndex):
    new_df = new_df.reset_index()

In [68]:
# Confirm that the former index levels now appear as regular columns.
new_df.head()

Unnamed: 0,COUNTRY,SOURCE,SEX,AGE,PRICE
0,bra,android,male,46,59.0
1,usa,android,male,36,59.0
2,fra,android,female,24,59.0
3,usa,ios,male,32,54.0
4,deu,android,female,36,49.0


In [74]:
# Bucket AGE into labelled ranges and store them in a new CAT_AGE column.
# The last bucket ends at the oldest age present in the data, and its label
# is built from that same value.
oldest_age = new_df["AGE"].max()
age_edges = [0, 18, 23, 30, 40, oldest_age]
age_labels = ["0_18", "19_23", "24_30", "31_40", "41_" + str(oldest_age)]

new_df["CAT_AGE"] = pd.cut(new_df["AGE"], bins=age_edges, labels=age_labels)


In [75]:
# Check the newly added CAT_AGE column against the raw AGE values.
new_df.head()

Unnamed: 0,COUNTRY,SOURCE,SEX,AGE,PRICE,CAT_AGE
0,bra,android,male,46,59.0,41_66
1,usa,android,male,36,59.0,31_40
2,fra,android,female,24,59.0,24_30
3,usa,ios,male,32,54.0,31_40
4,deu,android,female,36,49.0,31_40


In [76]:
# Build the level-based persona key COUNTRY_SOURCE_SEX_AGE-RANGE (upper-cased)
# and add it to the dataset.
# Vectorized string operations replace the former per-row Python loop: they
# are faster and do not depend on the index labels being unique.
# CAT_AGE is categorical, so it is cast to str before the string methods apply.
new_df["Customers_Level_Based"] = (
    new_df["COUNTRY"].str.upper()
    + "_" + new_df["SOURCE"].str.upper()
    + "_" + new_df["SEX"].str.upper()
    + "_" + new_df["CAT_AGE"].astype(str).str.upper()
)


In [77]:
# Several rows can share one persona key, so collapse them:
# keep only the key and PRICE, average PRICE per key, sort highest first.
persona_prices = new_df[["Customers_Level_Based", "PRICE"]]
persona_means = persona_prices.groupby("Customers_Level_Based")[["PRICE"]].mean()
new_df = persona_means.reset_index().sort_values(by="PRICE", ascending=False)

In [79]:
# Personas with the highest average PRICE (the frame is sorted by PRICE, descending).
new_df.head()

Unnamed: 0,Customers_Level_Based,PRICE
54,FRA_ANDROID_FEMALE_24_30,45.428571
86,TUR_IOS_MALE_24_30,45.0
87,TUR_IOS_MALE_31_40,42.333333
72,TUR_ANDROID_FEMALE_31_40,41.833333
25,CAN_ANDROID_MALE_19_23,40.111111


# Segment new customers
* Divide into 4 segments according to the PRICE variable

In [85]:
# Split the personas into PRICE quartiles: "A" is the highest-priced
# quartile and "D" the lowest.
new_df["SEGMENT"] = pd.qcut(new_df["PRICE"], 4, labels=["D","C","B","A"])

# Per-segment PRICE summary. It is printed explicitly because a notebook only
# displays a cell's last expression; as a bare mid-cell expression the summary
# was computed and then silently discarded.
# observed=False keeps the current "show every category" behaviour and states
# it explicitly, since the default for categorical groupers is changing.
cal = ["mean", "max","sum"]
print(new_df.groupby(["SEGMENT"], observed=False).agg({"PRICE": cal}))

# Personas that fall into segment C.
SEGMENT_C = new_df[new_df["SEGMENT"]=="C"]
SEGMENT_C.head()

Unnamed: 0,Customers_Level_Based,PRICE,SEGMENT
1,BRA_ANDROID_FEMALE_19_23,34.07734,C
12,BRA_IOS_FEMALE_24_30,34.015873,C
60,FRA_ANDROID_MALE_41_66,34.0,C
44,DEU_IOS_FEMALE_19_23,34.0,C
81,TUR_IOS_FEMALE_24_30,34.0,C


In [86]:
# Descriptive statistics of PRICE within segment C.
SEGMENT_C.describe()

Unnamed: 0,PRICE
count,27.0
mean,33.509674
std,0.492587
min,32.5
25%,33.0
50%,33.627634
75%,34.0
max,34.07734


# Categorize new customers by segment and estimate how much revenue they can generate

* Which segment does a 25-year-old Canadian woman using iOS belong to, and how much revenue is she expected to generate on average?

In [94]:
# Persona key of the new user, written in the same upper-case
# COUNTRY_SOURCE_SEX_AGE-RANGE format as the Customers_Level_Based column.
new_user_1 = "CAN_IOS_FEMALE_24_30"

In [95]:
# Look up the average PRICE and the segment recorded for this persona.
new_df.loc[new_df["Customers_Level_Based"].eq(new_user_1)]

Unnamed: 0,Customers_Level_Based,PRICE,SEGMENT
29,CAN_IOS_FEMALE_24_30,31.727273,D
