In [None]:
pip install -r requirements.txt

# Import


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
import seaborn as sns
from plotly.subplots import make_subplots
import pickle
import plotly.graph_objects as go
import plotly.express as px

import charset_normalizer
import fuzzywuzzy
from fuzzywuzzy import process
import os
import re
from dateutil import parser

import warnings

warnings.filterwarnings("ignore")
plt.style.use(
    "https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-light.mplstyle"
)



# Character encoding


In [3]:
df_name = []
encoding = []
path = []

dataset_folder = "dataset"
for file in os.listdir(dataset_folder):
    file_path = os.path.join(dataset_folder, file)
    path.append(file_path)
    # look at the first ten thousand bytes to guess the character encoding
    with open(file_path, "rb") as rawdata:
        result = charset_normalizer.detect(rawdata.read())
        encoding.append(result["encoding"])
    # check what the character encoding might be
    file_name = os.path.splitext(file)[0]
    df_name.append(file_name)
    print(f"{file_name}: {result}")

context: {'encoding': 'ascii', 'language': 'English', 'confidence': 1.0}
mobile_plan_attr: {'encoding': 'ascii', 'language': 'English', 'confidence': 1.0}
mobile_plan_user: {'encoding': 'ascii', 'language': 'English', 'confidence': 1.0}
user: {'encoding': 'utf-8', 'language': 'English', 'confidence': 1.0}


In [4]:
data_source = (tuple(df_name), tuple(encoding), tuple(path))

In [5]:
context = pd.read_csv(data_source[2][0], encoding=data_source[1][0])
context.sample(5)

Unnamed: 0,id,purpose,go_with,weather,time,viettel_no_0,viettel_no_1,viettel_no_2,to_hanoi,to_other,score,direction
2962,5626,Travel,Family,&Su!nny,10:00,1,1,0,0,1,9,0
6662,3203,Work,!Al& one,Sun?ny,7 AM,1,1,0,0,1,10,0
1191,8286,Travel,F&riend(s),~?&Sunny,6 PM,1,0,0,0,1,0,0
2972,13141,Work,Alone,!&~&Sunny,7:00,1,1,0,1,0,7,1
3431,2339,Visit,A~l! one,Sunn%y,20:00,1,1,0,1,0,7,1


In [6]:
mobile_plan_attr = pd.read_csv(
    data_source[2][1], encoding=data_source[1][1], delimiter=";"
)

mobile_plan_attr.sample(5)

Unnamed: 0,mobile_plan,description,price,duration
1,DATAGOLD,"5GB/ day, high speed",200000,5d
3,SOCIALMEDIAGOLD,"3GB/ day, high speed\nUnlimited for Tik Tok, F...",250000,3d
4,DATACALL,"2GB/ day, high speed\n300 mins call for extern...",200000,5d
2,SOCIALMEDIA,"1GB/ day\nUnlimited for Tik Tok, Facebook, You...",150000,3d
0,DATASILVER,"2GB/ day, high speed",100000,5d


In [7]:
mobile_plan_user = pd.read_csv(data_source[2][2], encoding=data_source[1][2])
mobile_plan_user.sample(5)

Unnamed: 0,id,mobile_plan,accept
34928,5525.0,SOCIALMEDIAGOLD,0.0
33219,2860.0,DATACALL,0.0
41765,16381.0,DATASILVER,0.0
41179,5962.0,,0.0
41253,12888.0,DATAGOLD,0.0


In [8]:
user = pd.read_csv(data_source[2][3], encoding=data_source[1][3])
user.sample(5)

Unnamed: 0,id,name,gender,age,education,profession,income,living_with,nation,phone,job,fb_freq,yt_freq,insta_freq,use_less_than_2GB,use_2GB_to_4GB
5154,19029,Katrina Terry,Female,25,Associate,Sales & Related,712500000 VND,S in gle_0,ENGLAND,001-665-337-4120x023,Secondary school teacher,8,3,8,1,0
42,1171,Peter Carney,Other,49,Bachelor,Education&Training&Library,83100$,M a rried_4,ENGLAND,630-292-2627x81661,Product designer,0,1,1,3,0
11360,6043,Anita Adkins,Female,22,,Healthcare Support,17600$,Mar ri ed_0,ENGLAND,674-269-5386x724,Adult nurse,1,3,1,1,0
1275,3101,Dana Garcia,Female,32,,Education&Training&Library,1522500000 VND,M a rr ied_4,ENGLAND,399-456-9286x050,"Horticulturist, commercial",1,2,7,8,0
4708,10330,Susan Alexander,Female,21,,Food Preparation & Serving Related,150000000 VND,Unmarried_2,US,686-836-8538x3898,Legal executive,0,2,8,21,7


In [9]:
dataset = [context, mobile_plan_attr, mobile_plan_user, user]

# Cleaning

## Missing values check


In [10]:
for x in dataset:
  print('-----------------')
  print(x.isna().sum())

-----------------
id              0
purpose         0
go_with         0
weather         0
time            0
viettel_no_0    0
viettel_no_1    0
viettel_no_2    0
to_hanoi        0
to_other        0
score           0
direction       0
dtype: int64
-----------------
mobile_plan    0
description    0
price          0
duration       0
dtype: int64
-----------------
id                0
mobile_plan    2715
accept         2721
dtype: int64
-----------------
id                      0
name                    0
gender                  0
age                     0
education            4003
profession              0
income                  0
living_with             0
nation                  0
phone                   0
job                     0
fb_freq                 0
yt_freq                 0
insta_freq              0
use_less_than_2GB       0
use_2GB_to_4GB          0
dtype: int64


In [11]:
mobile_plan_user = mobile_plan_user.dropna(subset=['mobile_plan', 'accept'])

## Duplicate


In [12]:
user['id'].duplicated().sum()

0

In [13]:
context['id'].duplicated().sum()

0

In [14]:
mobile_plan_user.sort_values(by='id').head()

Unnamed: 0,id,mobile_plan,accept
18023,1000.0,SOCIALMEDIA,0.0
18022,1000.0,DATACALL,0.0
18021,1000.0,DATASILVER,0.0
3014,1000.0,DATASILVER,1.0
3087,1001.0,SOCIALMEDIA,1.0


## Context


In [15]:
context.sample(5)

Unnamed: 0,id,purpose,go_with,weather,time,viettel_no_0,viettel_no_1,viettel_no_2,to_hanoi,to_other,score,direction
1934,3884,Travel,Alone,Su!&!nny,10 AM,1,0,0,0,1,0,0
9964,5639,Visit,Alone,Su?&nny,3PM,1,0,0,1,0,0,1
6066,2123,Travel,Friend(s),Sunny,17:00,1,0,0,0,1,15,0
7763,3494,Work,??!!Alone,S??~%unny,7AM,1,1,0,0,1,0,0
10123,6158,Visit,Alo ne,Snowy,19:00,1,1,1,0,1,0,0


### Go with and weather


In [16]:
for x in ["go_with", "weather"]:
    # Eliminate special characters and white spaces
    context[x] = context[x].apply(lambda x: re.sub(r"[^a-zA-Z\s]", "", x))

    # Lowercase all words
    context[x] = context[x].str.lower()

    print(context[x].unique())

['alone' 'friends' 'fa mily' 'family' 'fami ly' 'a l one' 'fr iends'
 ' alone' 'a lone' '  alone' ' friends' 'f riends' 'f rie nds' 'frie nds'
 'frien ds' 'f amily' ' family' 'alo ne' 'friend s' 'al one' 'fri ends'
 ' a lone' ' fri ends' 'f riend s' 'alon e' 'frien  ds' ' fri e nds'
 'friends ' 'a  lone' 'fa  mily' 'al  one' '   alone' 'fri e nds'
 ' al one' 'fam ily' '  family' 'f rien ds' '  a lone' ' f amily'
 'frie   nds' 'fri en ds' 'f a mily' ' frie nds' 'f  amily' 'fr  iends'
 ' fa mily' 'famil y' ' fr iends' 'fr ie nds' 'fr ien ds' ' f  amily'
 'fam  ily' ' frien ds' 'frie nd s' 'frie n ds' ' friend s' 'frie  nds'
 'fr i ends' 'fri  ends' 'f ri ends' ' a  lone' ' alo ne' 'friend  s'
 'a   lone' '  friends' '   friends' ' fam ily' 'f r iends' 'alo  ne'
 '  a  lone' ' fa  mily' 'f r  iends' 'f  riends' ' frie  nds' 'fri end s'
 ' fami ly' 'f rie  nds' ' f riends' ' al  one' 'f a  mily' ' fa m ily'
 '    alone' 'f  ri ends' 'a lo ne' 'frien d s' 'f am ily' '  f riends'
 '   family

In [17]:
# List of possible values for the column 'go_with'
choices = ["alone", "friend(s)", "family"]


# Apply the fuzzy matching to the column 'go_with'
def correct_name(name):
    return process.extractOne(name, choices)[0]


context["go_with"] = context["go_with"].apply(correct_name)

### Time


In [18]:
# Eliminate white spaces
context["time"] = context["time"].str.replace(r"\s+", "", regex=True)


# Function to convert time to 24-hour format
def convert_to_24hr_format(time_str):
    # Use dateutil.parser to parse the time string
    return parser.parse(time_str).strftime("%H:%M")


# Apply the function to the column 'time'
context["time"] = context["time"].apply(convert_to_24hr_format)
context["time"] = pd.to_datetime(context["time"], format="%H:%M")

def time_of_day(hour):
    if 5 <= hour < 12:
        return 'morning'
    elif 12 <= hour < 17:
        return 'afternoon'
    elif 17 <= hour < 21:
        return 'evening'
    else:
        return 'night'

context['hour'] = pd.to_datetime(context['time']).dt.hour
context['time_of_day'] = context['hour'].apply(time_of_day)

context = context.drop(columns=['time', 'hour'])

### Purpose


In [19]:
context["purpose"] = context["purpose"].str.strip()
context["purpose"] = context["purpose"].str.lower()

In [20]:
context.purpose.unique()

array(['travel', 'visit', 'work'], dtype=object)

In [21]:
context["purpose"] = context["purpose"].replace('travel', 'visit')

### to_hanoi, to_other, direction


In [22]:
context = context.drop(axis=1, columns=["to_hanoi", "to_other"])

In [23]:
context.sample(5)

Unnamed: 0,id,purpose,go_with,weather,viettel_no_0,viettel_no_1,viettel_no_2,score,direction,time_of_day
9183,4165,visit,alone,sunny,1,1,0,0,1,afternoon
8528,8973,visit,alone,sunny,1,0,0,10,1,evening
11465,3977,visit,alone,sunny,1,0,0,17,1,evening
1900,1432,visit,friend(s),sunny,1,1,0,0,0,night
10496,2097,work,alone,snowy,1,1,0,8,1,morning


## User


In [24]:
user.sample(5)

Unnamed: 0,id,name,gender,age,education,profession,income,living_with,nation,phone,job,fb_freq,yt_freq,insta_freq,use_less_than_2GB,use_2GB_to_4GB
9266,12280,Brian Alvarado,Male,39,Bachelor,Computer & Mathematical,59200$,Mar ried_2,US,559.476.7840,"Scientist, product/process development",2,0,1,7,0
9895,18035,ÊòìÂ©∑Â©∑,Female,38,Associate,Unemployed,80300$,Un married_0,CHINA,15097104939,ÁΩëÂ∫óÊ∑òÂÆù,0,0,23,9,1
3338,8572,Jose Lewis,Male,40,,Computer & Mathematical,37500$,Sin gle_0,ENGLAND,001-708-563-1937x10235,Armed forces technical officer,1,1,8,2,1
5204,1641,Asafe Machado,Male,21,,Student,585000000 VND,Un m ar ried_0,BRAZIL,+55 31 8395 9918,Profissional de recursos humanos,0,0,9,7,1
8490,6878,David Scott,Male,23,Highschool,Unemployed,910000000 VND,Single_0,US,942-659-5156x10546,Science writer,1,1,7,2,0


### living_with


In [25]:
# Eliminate white spaces
user["living_with"] = user["living_with"].str.strip()
user["living_with"] = user["living_with"].str.replace(r"\s+", "", regex=True)
user["living_with"] = user["living_with"].str.lower()

### nation


In [26]:
user["nation"] = user["nation"].str.strip()
user["nation"] = user["nation"].str.lower()

user["nation"] = user["nation"].replace("australia", "oceania")
user["nation"] = user["nation"].replace(["brazil", "us"], "americas")
user["nation"] = user["nation"].replace(["denmark", "england", "russia"], "europe")
user["nation"] = user["nation"].replace(["korea", "japan", "china"], "asia")

user = user.rename(columns={"nation": "continent"})

### gender


In [27]:
user["gender"] = user["gender"].str.strip()
user["gender"] = user["gender"].str.lower()

### education


In [28]:
user["education"] = user["education"].str.strip()
user["education"] = user["education"].str.lower()

user["education"] = user["education"].replace("bachelor", "grad")
user["education"] = user["education"].replace(["associate", "highschool"], "undergrad")
user["education"] = user["education"].replace("masters", "postgrad")
user["education"] = user["education"].fillna('unknown')

### income


In [29]:
currency_exchange_rate = 23000

# Function to clean and adjust income values
def clean_income(value):
    if 'VND' in value:
        return int(value.replace('VND', '')) / currency_exchange_rate
    elif '$' in value:
        return int(value.replace('$', ''))
    else:
        return int(value)

# Apply the function to the 'income' column
user['income_dollar'] = user['income'].apply(clean_income)

In [30]:
user['income_level'] = user['income_dollar'].apply(lambda x: 'lower' if x <= 30000 else 'lower-middle' if x > 30000 and x <= 58000 else 'middle' if x > 58000 and x <= 94000 else 'upper-middle' if x > 94000 and x <= 153000 else 'upper' if x > 153000 and x <= 200000 else 'extreme upper')

### profession


In [31]:
user['profession'] = user['profession'].str.lower()

user['profession']= user['profession'].apply(lambda x: "jobs" if x not in ['student', 'retired', 'unemployed'] else x)

### marital status


In [32]:
# Seperate Living_With column into Maritual_Status column and Children column
user[["marital_status", "children"]] = user["living_with"].str.split("_", expand=True)


user["marital_status"] = user["marital_status"].apply(
    lambda x: "relationship" if x == "married" else "single"
)

user = user.drop(axis=1, columns=["living_with"])

### job


In [33]:
# from langdetect import detect

# # Function to check if a text is in English
# def is_english(text):
#     try:
#         return detect(text) == 'en'
#     except:
#         return False

# # Create a new column 'is_english' to check if the job is in English
# user['is_english'] = user['job'].apply(is_english)

# # Calculate the percentage of non-English jobs
# non_english_count = user['is_english'].value_counts().get(False, 0)
# total_count = len(user)
# percentage_non_english = (non_english_count / total_count) * 100

# print(f"Other languages: {percentage_non_english:.2f}%")

=> Drop job column


In [34]:
# user = user.drop(columns=['is_english'])
user = user.drop(columns=['job'])

In [35]:
user

Unnamed: 0,id,name,gender,age,education,profession,income,continent,phone,fb_freq,yt_freq,insta_freq,use_less_than_2GB,use_2GB_to_4GB,income_dollar,income_level,marital_status,children
0,11156,Rachel Gibbs,female,21,unknown,unemployed,39100$,oceania,(08)-8012-7556,0,0,9,8,3,39100.000000,lower-middle,single,2
1,4297,Karen Anderson,other,22,unknown,unemployed,41000$,europe,943-646-5203,0,0,23,7,2,41000.000000,lower-middle,single,2
2,13301,ÍπÄÏßÄÏõê,female,24,unknown,unemployed,44300$,asia,010-4500-9888,0,0,23,5,2,44300.000000,lower-middle,single,1
3,9920,Elisabeth W√§hner,female,24,unknown,unemployed,44400$,europe,+49(0) 587406963,0,0,21,7,2,44400.000000,lower-middle,single,1
4,8424,Sra. Maria Luiza Nogueira,female,25,unknown,unemployed,1100000000 VND,americas,(084) 0568 1445,0,0,22,5,3,47826.086957,lower-middle,single,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11567,11602,Austin Barber,male,26,grad,jobs,82000$,europe,001-798-297-3345x825,0,0,3,5,3,82000.000000,middle,single,0
11568,5645,ÎÇ®ÏÉÅÏ≤†,male,26,grad,jobs,83900$,asia,043-900-0635,0,0,2,4,1,83900.000000,middle,single,0
11569,15477,John Blanchard,male,30,grad,jobs,76600$,americas,230-283-6789x89147,0,0,3,7,3,76600.000000,middle,single,0
11570,8885,Troy Green,male,30,grad,jobs,2087500000 VND,oceania,+61.472.119.694,0,0,2,7,3,90760.869565,middle,single,0


## mobile_plan_attr


In [36]:
mobile_plan_attr.head()

Unnamed: 0,mobile_plan,description,price,duration
0,DATASILVER,"2GB/ day, high speed",100000,5d
1,DATAGOLD,"5GB/ day, high speed",200000,5d
2,SOCIALMEDIA,"1GB/ day\nUnlimited for Tik Tok, Facebook, You...",150000,3d
3,SOCIALMEDIAGOLD,"3GB/ day, high speed\nUnlimited for Tik Tok, F...",250000,3d
4,DATACALL,"2GB/ day, high speed\n300 mins call for extern...",200000,5d


In [37]:
mobile_plan_attr['mobile_plan'] = mobile_plan_attr['mobile_plan'].str.lower()

In [38]:
mobile_plan_attr['duration'] = mobile_plan_attr['duration'].str.replace('d', '')

In [39]:
mobile_plan_attr['capacity'] = mobile_plan_attr['description'].str.extract(r'(\d+)GB')

In [40]:
mobile_plan_attr.head()

Unnamed: 0,mobile_plan,description,price,duration,capacity
0,datasilver,"2GB/ day, high speed",100000,5,2
1,datagold,"5GB/ day, high speed",200000,5,5
2,socialmedia,"1GB/ day\nUnlimited for Tik Tok, Facebook, You...",150000,3,1
3,socialmediagold,"3GB/ day, high speed\nUnlimited for Tik Tok, F...",250000,3,3
4,datacall,"2GB/ day, high speed\n300 mins call for extern...",200000,5,2


## mobile_plan_user


In [41]:
mobile_plan_user.head()

Unnamed: 0,id,mobile_plan,accept
0,11156.0,DATASILVER,1.0
1,4297.0,SOCIALMEDIAGOLD,1.0
2,13301.0,DATASILVER,1.0
3,9920.0,SOCIALMEDIAGOLD,1.0
4,8424.0,DATASILVER,1.0


In [42]:
mobile_plan_user['mobile_plan'] = mobile_plan_user['mobile_plan'].str.lower()

In [43]:
mobile_plans = mobile_plan_user.mobile_plan.unique()

In [44]:
mobile_plans

array(['datasilver', 'socialmediagold', 'socialmedia', 'datacall',
       'datagold'], dtype=object)

In [45]:
mobile_plan_user

Unnamed: 0,id,mobile_plan,accept
0,11156.0,datasilver,1.0
1,4297.0,socialmediagold,1.0
2,13301.0,datasilver,1.0
3,9920.0,socialmediagold,1.0
4,8424.0,datasilver,1.0
...,...,...,...
45312,14062.0,datagold,0.0
45314,13710.0,datagold,0.0
45315,10956.0,socialmedia,0.0
45316,14977.0,datasilver,0.0


### Mobile plan recommendation


In [46]:
# mobile_plan_rec_raw = mobile_plan_user[["id"]]

# # Create columns for each item in the mobile_plan_user dataframe
# for item in mobile_planes:
#     mobile_plan_rec_raw[item] = mobile_plan_user["mobile_plan"].apply(
#         lambda x: 1 if x == item else np.nan
#     )

# # Check rows that have duplicated ID
# duplicate_rows = mobile_plan_rec_raw[
#     mobile_plan_rec_raw.duplicated(subset=["id"], keep=False)
# ]
# # Sort by ID
# duplicate_rows.sort_values("id")

# # Fill missing values of each ID group
# # In each ID group, each missing value will be replaced by the last valid value forward (ffill), otherwise, by the next valid value backward (bfill).
# mobile_plan_rec = duplicate_rows.groupby("id").apply(
#     lambda x: x.fillna(method="ffill").fillna(method="bfill")
# )
# mobile_plan_rec = mobile_plan_rec.drop_duplicates()

# mobile_plan_rec = mobile_plan_rec.drop(axis=1, columns="id").reset_index()
# mobile_plan_rec = mobile_plan_rec.drop(axis=1, columns="level_1")

# mobile_plan_rec = mobile_plan_rec.fillna(0)
# mobile_plan_rec

### Mobile plan conversion rate


In [47]:
# mobile_plan_accept = mobile_plan_user.pivot_table(
# index="id", columns="mobile_plan", values="accept", aggfunc="sum", fill_value=0
# ).reset_index()

# # ƒê·∫£m b·∫£o t·∫•t c·∫£ c√°c c·ªôt trong mobile_planes c√≥ m·∫∑t trong b·∫£ng k·∫øt qu·∫£ cu·ªëi c√πng
# for plan in mobile_planes:
#   if plan not in mobile_plan_accept.columns:
#     mobile_plan_accept[plan] = 0

# # S·∫Øp x·∫øp l·∫°i c√°c c·ªôt theo th·ª© t·ª± trong mobile_planes
# mobile_plan_accept = mobile_plan_accept[["id"] + list(mobile_planes)]

In [48]:
# # S·ª≠ d·ª•ng pivot_table ƒë·ªÉ t·ªïng h·ª£p d·ªØ li·ªáu
# num_rec = mobile_plan_user.pivot_table(
#     index="id", columns="mobile_plan", values="mobile_plan", aggfunc="count", fill_value=0
# ).reset_index()

# # ƒê·∫£m b·∫£o t·∫•t c·∫£ c√°c c·ªôt trong mobile_planes c√≥ m·∫∑t trong b·∫£ng k·∫øt qu·∫£ cu·ªëi c√πng
# for plan in mobile_planes:
#     if plan not in num_rec.columns:
#         num_rec[plan] = 0

# # S·∫Øp x·∫øp l·∫°i c√°c c·ªôt theo th·ª© t·ª± trong mobile_planes
# num_rec = num_rec[["id"] + list(mobile_planes)]

# # ƒê·∫£m b·∫£o r·∫±ng c·∫£ hai b·∫£ng ƒë·ªÅu c√≥ c√πng th·ª© t·ª± c·ªôt
# assert list(mobile_plan_accept.columns) == list(num_rec.columns)

# # T·∫°o b·∫£ng mobile_plan_cr b·∫±ng c√°ch chia mobile_plan_accept cho num_rec
# mobile_plan_cr = mobile_plan_accept.copy()

# # T√≠nh t·ª∑ l·ªá chuy·ªÉn ƒë·ªïi (conversion rate)
# for plan in mobile_planes:
#     mobile_plan_cr[plan] = mobile_plan_accept[plan] / num_rec[plan]

# mobile_plan_cr.fillna(0, inplace=True)

# # Hi·ªÉn th·ªã b·∫£ng mobile_plan_cr
# mobile_plan_cr

## Data type check


In [49]:
dataset = [context, mobile_plan_attr, mobile_plan_user, user]
for x in dataset:
    print("-------------------------")
    print(x.dtypes)

-------------------------
id               int64
purpose         object
go_with         object
weather         object
viettel_no_0     int64
viettel_no_1     int64
viettel_no_2     int64
score            int64
direction        int64
time_of_day     object
dtype: object
-------------------------
mobile_plan    object
description    object
price           int64
duration       object
capacity       object
dtype: object
-------------------------
id             float64
mobile_plan     object
accept         float64
dtype: object
-------------------------
id                     int64
name                  object
gender                object
age                    int64
education             object
profession            object
income                object
continent             object
phone                 object
fb_freq                int64
yt_freq                int64
insta_freq             int64
use_less_than_2GB      int64
use_2GB_to_4GB         int64
income_dollar        float64
income_lev

In [50]:
mobile_plan_attr.duration = mobile_plan_attr.duration.astype(int)
mobile_plan_attr.capacity = mobile_plan_attr.capacity.astype(int)


user.children = user.children.astype(int)
user.income_dollar = user.income_dollar.astype(int)

## Input for EDA

In [67]:
eda_df = user.merge(context, on="id", how="left")

eda_df = eda_df.dropna(subset=["mobile_plan", "accept"])

KeyError: ['mobile_plan', 'accept']

In [68]:
eda_df

Unnamed: 0,id,name,gender,age,education,profession,income,continent,phone,fb_freq,...,children,purpose,go_with,weather,viettel_no_0,viettel_no_1,viettel_no_2,score,direction,time_of_day
0,11156,Rachel Gibbs,female,21,unknown,unemployed,39100$,oceania,(08)-8012-7556,0,...,2,visit,alone,sunny,1,0,0,0,0,afternoon
1,4297,Karen Anderson,other,22,unknown,unemployed,41000$,europe,943-646-5203,0,...,2,visit,friend(s),sunny,1,1,0,0,0,morning
2,13301,ÍπÄÏßÄÏõê,female,24,unknown,unemployed,44300$,asia,010-4500-9888,0,...,1,visit,friend(s),sunny,1,1,0,0,0,afternoon
3,9920,Elisabeth W√§hner,female,24,unknown,unemployed,44400$,europe,+49(0) 587406963,0,...,1,visit,friend(s),sunny,1,1,0,30,0,afternoon
4,8424,Sra. Maria Luiza Nogueira,female,25,unknown,unemployed,1100000000 VND,americas,(084) 0568 1445,0,...,1,visit,family,sunny,1,1,0,0,0,morning
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11567,11602,Austin Barber,male,26,grad,jobs,82000$,europe,001-798-297-3345x825,0,...,0,visit,alone,snowy,1,1,0,0,0,night
11568,5645,ÎÇ®ÏÉÅÏ≤†,male,26,grad,jobs,83900$,asia,043-900-0635,0,...,0,visit,family,sunny,1,1,0,0,1,evening
11569,15477,John Blanchard,male,30,grad,jobs,76600$,americas,230-283-6789x89147,0,...,0,work,alone,snowy,1,0,0,13,1,morning
11570,8885,Troy Green,male,30,grad,jobs,2087500000 VND,oceania,+61.472.119.694,0,...,0,work,alone,snowy,1,1,1,0,0,morning


## Input for model


In [69]:
df = (user
      .merge(context, on="id", how="left"))

df_mobile_plan = pd.DataFrame(mobile_plans, columns=['mobile_plan'])
df1 = df.merge(df_mobile_plan, how='cross')
df1

# Merge the dataframes
merged_df = df1.merge(mobile_plan_user, on=['id', 'mobile_plan'], how='left')

# Create a new column 'mobile_plan_recommend' and replace NaN values with 0
merged_df['mobile_plan_recommend'] = merged_df['accept'].notna().astype(int)

# Create a new column 'mobile_plan_accept' and replace NaN values with 0
merged_df['mobile_plan_accept'] = merged_df['accept'].fillna(0).astype(int)

# Remove the 'accept' column
merged_df = merged_df.drop(columns=['accept'])

In [70]:
# Nh√≥m l·∫°i theo id, school v√† t√≠nh t·ªïng cho c√°c c·ªôt school_recommend v√† school_accept
grouped_df = merged_df.groupby(['id', 'name', 'gender', 'age', 'education', 'profession', 'income',
       'continent', 'phone', 'fb_freq', 'yt_freq', 'insta_freq',
       'use_less_than_2GB', 'use_2GB_to_4GB', 'income_dollar', 'income_level',
       'marital_status', 'children', 'purpose', 'go_with', 'weather',
       'viettel_no_0', 'viettel_no_1', 'viettel_no_2', 'score', 'direction',
       'time_of_day', 'mobile_plan']).agg({
    'mobile_plan_recommend': 'sum',
    'mobile_plan_accept': 'sum'
}).reset_index()

grouped_df['cr'] =  grouped_df['mobile_plan_accept']/grouped_df['mobile_plan_recommend']

grouped_df = grouped_df.fillna(0)

grouped_df[grouped_df['id'] == 1000]

Unnamed: 0,id,name,gender,age,education,profession,income,continent,phone,fb_freq,...,viettel_no_0,viettel_no_1,viettel_no_2,score,direction,time_of_day,mobile_plan,mobile_plan_recommend,mobile_plan_accept,cr
0,1000,Ëë£ÁßÄÊ¢Ö,female,38,postgrad,jobs,35200$,asia,15520630534,1,...,1,1,0,0,0,night,datacall,1,0,0.0
1,1000,Ëë£ÁßÄÊ¢Ö,female,38,postgrad,jobs,35200$,asia,15520630534,1,...,1,1,0,0,0,night,datagold,0,0,0.0
2,1000,Ëë£ÁßÄÊ¢Ö,female,38,postgrad,jobs,35200$,asia,15520630534,1,...,1,1,0,0,0,night,datasilver,2,1,0.5
3,1000,Ëë£ÁßÄÊ¢Ö,female,38,postgrad,jobs,35200$,asia,15520630534,1,...,1,1,0,0,0,night,socialmedia,1,0,0.0
4,1000,Ëë£ÁßÄÊ¢Ö,female,38,postgrad,jobs,35200$,asia,15520630534,1,...,1,1,0,0,0,night,socialmediagold,0,0,0.0


In [71]:
grouped_df['target'] = grouped_df[['mobile_plan_recommend', 'cr']].apply(lambda x: 0 if x['mobile_plan_recommend'] == 0 else 1 if x['mobile_plan_recommend'] > 0 and x['cr'] == 0 else 2 if x['mobile_plan_recommend'] > 0 and x['cr'] < 0.5 else 3, axis=1)

## Outliers


In [72]:
# categorical_columns = ['gender', 'education', 'profession', 'continent', 'income_level', 'marital_status', 'purpose', 'go_with', 'weather', 'viettel_no_0', 'viettel_no_1', 'viettel_no_2', 'direction', 'time_of_day', 'datasilver', 'socialmediagold', 'socialmedia', 'datacall', 'datagold']

# continuous_columns = df.drop(axis=1, columns=categorical_columns).columns

# # Create a canvas with 4 columns and 4 rows
# fig, axes = plt.subplots(3, 3, figsize=(8, 8))

# # Draw boxplot on the canvas
# for i, ax in enumerate(axes.flatten()):
#     if i < len(continuous_columns):
#         sns.boxplot(y=continuous_columns[i], data=df, ax=ax)
#         ax.set_title(f'Boxplot of {continuous_columns[i]}')
#         ax.set_ylabel(continuous_columns[i])
#     else:
#         ax.axis('off')

# # Fit layout
# plt.tight_layout()

# plt.show()

# Feature engineering


## Social media


In [73]:
sm_avg_usage = grouped_df[['fb_freq', 'yt_freq', 'insta_freq']].apply(lambda x: x.sum(), axis=1)
sm_avg_usage

grouped_df['sm_ext'] = sm_avg_usage.apply(lambda x: 1 if x > 10 else 0)

sm_avg_usage = eda_df[['fb_freq', 'yt_freq', 'insta_freq']].apply(lambda x: x.sum(), axis=1)
sm_avg_usage

eda_df['sm_ext'] = sm_avg_usage.apply(lambda x: 1 if x > 10 else 0)

## use_less_than_2GB and use_2GB_to_4GB


In [74]:
usage = ['use_less_than_2GB', 'use_2GB_to_4GB']

for x in usage:
  range = 1.5
  IQR = grouped_df[x].quantile(0.75) - grouped_df[x].quantile(0.25)
  upper_bound = grouped_df[x].quantile(0.75) + (IQR * range)

  grouped_df[f"{x}_ext"] = grouped_df[x].apply(lambda y: 1 if y > upper_bound else 0)

for x in usage:
  range = 1.5
  IQR = eda_df[x].quantile(0.75) - eda_df[x].quantile(0.25)
  upper_bound = eda_df[x].quantile(0.75) + (IQR * range)

  eda_df[f"{x}_ext"] = eda_df[x].apply(lambda y: 1 if y > upper_bound else 0)

## Education


In [75]:
grouped_df['education_abn'] = grouped_df['education'].apply(lambda x: 1 if x == 'unknown' else 0)

eda_df['education_abn'] = eda_df['education'].apply(lambda x: 1 if x == 'unknown' else 0)

## Score


In [76]:
range = 1.5
IQR = grouped_df['score'].quantile(0.75) - grouped_df['score'].quantile(0.25)
upper_bound = grouped_df['score'].quantile(0.75) + (IQR * range)

grouped_df["score_ext"] = grouped_df['score'].apply(lambda y: 1 if y > upper_bound else 0)

IQR = eda_df['score'].quantile(0.75) - eda_df['score'].quantile(0.25)
upper_bound = eda_df['score'].quantile(0.75) + (IQR * range)

eda_df["score_ext"] = eda_df['score'].apply(lambda y: 1 if y > upper_bound else 0)


In [77]:
df.head()

Unnamed: 0,id,name,gender,age,education,profession,income,continent,phone,fb_freq,...,children,purpose,go_with,weather,viettel_no_0,viettel_no_1,viettel_no_2,score,direction,time_of_day
0,11156,Rachel Gibbs,female,21,unknown,unemployed,39100$,oceania,(08)-8012-7556,0,...,2,visit,alone,sunny,1,0,0,0,0,afternoon
1,4297,Karen Anderson,other,22,unknown,unemployed,41000$,europe,943-646-5203,0,...,2,visit,friend(s),sunny,1,1,0,0,0,morning
2,13301,ÍπÄÏßÄÏõê,female,24,unknown,unemployed,44300$,asia,010-4500-9888,0,...,1,visit,friend(s),sunny,1,1,0,0,0,afternoon
3,9920,Elisabeth W√§hner,female,24,unknown,unemployed,44400$,europe,+49(0) 587406963,0,...,1,visit,friend(s),sunny,1,1,0,30,0,afternoon
4,8424,Sra. Maria Luiza Nogueira,female,25,unknown,unemployed,1100000000 VND,americas,(084) 0568 1445,0,...,1,visit,family,sunny,1,1,0,0,0,morning


# EDA

## Univariate


In [78]:
class EDA:
    
    def row(self,data):
        # Display the number of rows and columns in the dataframe
        fig = make_subplots(rows=1, cols=2)
        fig.add_trace(go.Indicator(mode = "number", value = data.shape[0], number={'font':{'color': '#E58F65','size':100}}, title = {"text": "üßæ Rows<br><span style='font-size:0.8em;color:gray'>In the Dataframe</span>"}, domain = {'x': [0, 0.5], 'y': [0.6, 1]}))
        fig.add_trace(go.Indicator(mode = "number", value = data.shape[1], number={'font':{'color': '#E58F65','size':100}}, title = {"text": "‚≠ï Columns<br><span style='font-size:0.8em;color:gray'>In the Dataframe</span>"}, domain = {'x': [0.5, 1], 'y': [0, 0.4]}))
        fig.show()
    
    def border_msg(self,msg, indent=1, width=None, title=None):
        """Print message-box with optional title."""
        lines = msg.split('\n')
        space = " " * indent
        if not width:
            width = max(map(len, lines))
        box = f'‚ïî{"‚ïê" * (width + indent * 2)}‚ïó\n'  
        if title:
            box += f'‚ïë{space}{title:<{width}}{space}‚ïë\n'  
            box += f'‚ïë{space}{"-" * len(title):<{width}}{space}‚ïë\n'  
        box += ''.join([f'‚ïë{space}{line:<{width}}{space}‚ïë\n' for line in lines])
        box += f'‚ïö{"‚ïê" * (width + indent * 2)}‚ïù' 
        print('\033[92m'+'\033[1m')
        print(box)
        
    def distribution(self,x,title):
        # Plot the distribution of a numerical column
        plt.figure(figsize=(10,8))
        ax = sns.distplot(x, kde=False,bins=30)
        values = np.array([rec.get_height() for rec in ax.patches])
        norm = plt.Normalize(values.min(), values.max())
        colors = plt.cm.jet(norm(values))
        for rec, col in zip(ax.patches,colors):
            rec.set_color(col)
        plt.title(title, size=20, color='black')
        
    def run(self,df):
        
        self.row(df)
        if len(df)>0:
            
            object_df = df.select_dtypes('object').columns.tolist()
            int_df = df.select_dtypes('int').columns.tolist()
            bool_df = df.select_dtypes('bool').columns.tolist()
            float_df = df.select_dtypes('float').columns.tolist()

            if len(object_df)>0:
                
                print( '\033[1m'+"OBJECT TYPE")
                for col in object_df:
                    # Display information about object type columns
                    self.border_msg(' '*25+ col.upper() + ' '*25)
                    self.border_msg('There are {} unique values in {} column'.format(df[col].nunique(),col.upper()))
                    plt.figure(figsize=(10,5))
                    sns.countplot(y = col, data = df,
                                  order = df[col].value_counts().index)
                    plt.show()
                    
            if len(int_df)>0:
                
                print('\033[1m'+"INT TYPE")
                for col in int_df:
                    # Display information about integer type columns
                    self.border_msg(' '*25+ col.upper() + ' '*25)
                    self.border_msg('Average value is : {}'.format(df[col].mean()))
                    self.border_msg('Minumum value is : {}'.format(df[col].min()))
                    self.border_msg('Maximum value is : {}'.format(df[col].max()))
                    self.distribution(df[col],title=col)
                    if df[col].mean()>df[col].std():
                        print(self.border_msg("Normal distributed Data Located below mean"))
                        
                    elif df[col].mean()<df[col].std():
                        print(self.border_msg("Normal distributed Data Located above mean"))
                    else:
                        self.border_msg("Mean Equals Std Dev - Distribution is normal")
                        
                    fig = make_subplots(rows=1, cols=2)
                    fig.add_trace(go.Indicator(mode = "number", value = df[col].mean(), number={'font':{'color': '#E58F65','size':100}}, title = {"text": "üìå Mean<br><span style='font-size:0.8em;color:gray'></span>"}, domain = {'x': [0, 0.5], 'y': [0.6, 1]}))
                    fig.add_trace(go.Indicator(mode = "number", value = df[col].std(), number={'font':{'color': '#E58F65','size':100}}, title = {"text": "üñá Standart dev<br><span style='font-size:0.8em;color:gray'></span>"}, domain = {'x': [0.5, 1], 'y': [0, 0.4]}))
                    fig.show()
                    plt.show()
                 

            if len(bool_df)>0:
                
                print('\033[1m'+"BOOL TYPE")
                for col in bool_df:
                    # Display information about boolean type columns
                    self.border_msg(' '*25+ col.upper() + ' '*25)
                    plt.figure(figsize=(10,5))
                    sns.countplot(y = col, data = df,
                                  order = df[col].value_counts().index)
                    plt.show()
                    
            if len(float_df)>0:
                
                print('\033[1m'+"FLOAT TYPE")
                for col in float_df:
                    # Display information about float type columns
                    for col in int_df:
                        self.distribution(df[col],title=col)
                        if df[col].mean()>df[col].std():
                            print(self.border_msg("Normal distributed Data Located below mean"))
                        
                        elif df[col].mean()<df[col].std():
                            print(self.border_msg("Normal distributed Data Located above mean"))
                        else:
                            self.border_msg("Mean Equals Std Dev - Distribution is normal")

In [79]:
# frame = EDA().run(df)

In [80]:
eda_df

Unnamed: 0,id,name,gender,age,education,profession,income,continent,phone,fb_freq,...,viettel_no_1,viettel_no_2,score,direction,time_of_day,sm_ext,use_less_than_2GB_ext,use_2GB_to_4GB_ext,education_abn,score_ext
0,11156,Rachel Gibbs,female,21,unknown,unemployed,39100$,oceania,(08)-8012-7556,0,...,0,0,0,0,afternoon,0,0,0,1,0
1,4297,Karen Anderson,other,22,unknown,unemployed,41000$,europe,943-646-5203,0,...,1,0,0,0,morning,1,0,0,1,0
2,13301,ÍπÄÏßÄÏõê,female,24,unknown,unemployed,44300$,asia,010-4500-9888,0,...,1,0,0,0,afternoon,1,0,0,1,0
3,9920,Elisabeth W√§hner,female,24,unknown,unemployed,44400$,europe,+49(0) 587406963,0,...,1,0,30,0,afternoon,1,0,0,1,1
4,8424,Sra. Maria Luiza Nogueira,female,25,unknown,unemployed,1100000000 VND,americas,(084) 0568 1445,0,...,1,0,0,0,morning,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11567,11602,Austin Barber,male,26,grad,jobs,82000$,europe,001-798-297-3345x825,0,...,1,0,0,0,night,0,0,0,0,0
11568,5645,ÎÇ®ÏÉÅÏ≤†,male,26,grad,jobs,83900$,asia,043-900-0635,0,...,1,0,0,1,evening,0,0,0,0,0
11569,15477,John Blanchard,male,30,grad,jobs,76600$,americas,230-283-6789x89147,0,...,0,0,13,1,morning,0,0,0,0,0
11570,8885,Troy Green,male,30,grad,jobs,2087500000 VND,oceania,+61.472.119.694,0,...,1,1,0,0,morning,0,0,0,0,0


In [86]:
mobile_plan_user

Unnamed: 0,id,mobile_plan,accept
0,11156.0,datasilver,1.0
1,4297.0,socialmediagold,1.0
2,13301.0,datasilver,1.0
3,9920.0,socialmediagold,1.0
4,8424.0,datasilver,1.0
...,...,...,...
45312,14062.0,datagold,0.0
45314,13710.0,datagold,0.0
45315,10956.0,socialmedia,0.0
45316,14977.0,datasilver,0.0


In [None]:
mobile_plan_user.id

-----------

In [81]:
df_mplans = {}
for x in mobile_plans:
  df_mplans.update({x: grouped_df[grouped_df['mobile_plan'] == x].drop(axis=1, columns=['mobile_plan', 'mobile_plan_recommend', 'mobile_plan_accept', 'cr'])})

to_drop = ['id', 'name', 'phone', 'income', 'phone']
for x in mobile_plans:
  df_mplans.update({x: df_mplans[x].drop(axis=1, columns=to_drop)})

# Feature selection


## Chi square test for categorical columns


In [82]:
# cat_col = [col for col in categorical_columns if col not in labels]

In [None]:
# from scipy.stats import chi2_contingency

# # Initialize an empty list to store results
# chi_sq_test = []

# # Loop through each label and each categorical column
# for x in labels:
#     for col in cat_col:
#         # Create contingency table
#         contingency_table = pd.crosstab(df[x], df[col])

#         # Perform chi-squared test
#         chi2, p, dof, ex = chi2_contingency(contingency_table)

#         # Append results to the list
#         chi_sq_test.append({
#             'Label': x,
#             'Categorical Column': col,
#             'Chi-squared': chi2,
#             'P-value': p,
#             'Significant (P < 0.05)': p < 0.05
#         })

# # Create a DataFrame from the results list
# chi_sq_test = pd.DataFrame(chi_sq_test)

# # Display the results
# chi_sq_test

In [None]:
# chi_sq_test[chi_sq_test['Significant (P < 0.05)'] == True]['Categorical Column'].unique()

In [None]:
# chi_sq_test[chi_sq_test['Significant (P < 0.05)'] == False]['Categorical Column'].unique()

## T test for continuous columns


In [None]:
# from scipy import stats

# # Kh·ªüi t·∫°o m·ªôt list r·ªóng ƒë·ªÉ l∆∞u c√°c k·∫øt qu·∫£
# point_biserial = []

# # T√≠nh to√°n v√† l∆∞u k·∫øt qu·∫£ v√†o list
# for label in labels:
#     for col in continuous_columns:
#         r, p_value = stats.pointbiserialr(df[label], df[col])
#         point_biserial.append((label, col, r, p_value))

# # T·∫°o DataFrame t·ª´ list k·∫øt qu·∫£
# point_biserial = pd.DataFrame(point_biserial, columns=['Label', 'Continuous Column', 'Point-Biserial Correlation', 'p-value'])

# # Th√™m c·ªôt th·ªèa ƒëi·ªÅu ki·ªán p-value < 0.05
# point_biserial['Significant (P < 0.05)'] = point_biserial['p-value'] < 0.05
# point_biserial

In [None]:
# point_biserial[point_biserial['Significant (P < 0.05)'] == True]['Continuous Column'].unique()

In [None]:
# point_biserial[point_biserial['Significant (P < 0.05)'] == False]['Continuous Column'].unique()

In [None]:
mobile_plans

# model


In [None]:
import joblib
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, hamming_loss
from sklearn.pipeline import Pipeline

In [None]:
# Set random seed
seed = 42

# Initialize classifiers
models = {
    'Random Forest Classifier': RandomForestClassifier(random_state=seed),
    'Gradient Boosting Classifier': GradientBoostingClassifier(random_state=seed),
    'AdaBoost Classifier': AdaBoostClassifier(random_state=seed),
    'Logistic Regression': LogisticRegression(random_state=seed),
    'SVC': SVC(),
    'KNeighbors Classifier': KNeighborsClassifier(),
    'XGBoost Classifier': XGBClassifier(random_state=seed)
}

# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

# Prepare to collect results
results = []

# Initialize LabelEncoder dictionary
label_dict = {
    'labels': ['datasilver', 'datagold', 'socialmedia', 'socialmediagold', 'datacall'],
    'cr': ['datasilver_cr', 'datagold_cr', 'socialmedia_cr', 'socialmediagold_cr', 'datacall_cr']
}

In [None]:
# Iterate through each label in label_dict['labels']
for x in mobile_plans:
    print(f"Training model for label {x}...")

    X = df_mplans[x].drop(axis=1, columns=['target'])
    y = df_mplans[x]['target']

    # Encode categorical columns
    check = (X.dtypes == 'object') | (X.dtypes == 'category')
    object_columns = list(check[check].index)

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), X.columns.difference(object_columns)),
            ('cat', OneHotEncoder(), object_columns)
        ],
        remainder='passthrough'
    )

    model_results = []
    best_model = None
    best_accuracy = 0

    for name, model in models.items():
        accuracy_cv_scores = []
        hamming_cv_losses = []
        precision_scores = []
        recall_scores = []
        f1_scores = []

        # Train the model using KFold
        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            pipeline = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('classifier', model)
            ])

            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)

            # Metrics
            acc = accuracy_score(y_test, y_pred)
            hamming = hamming_loss(y_test, y_pred)
            precision = precision_score(y_test, y_pred, average='micro')
            recall = recall_score(y_test, y_pred, average='micro')
            f1 = f1_score(y_test, y_pred, average='micro')

            accuracy_cv_scores.append(acc)
            hamming_cv_losses.append(hamming)
            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

        # Calculate the mean of each metric
        mean_accuracy = np.mean(accuracy_cv_scores)
        mean_hamming_loss = np.mean(hamming_cv_losses)
        mean_precision = np.mean(precision_scores)
        mean_recall = np.mean(recall_scores)
        mean_f1_score = np.mean(f1_scores)

        # Save model results
        model_results.append({
            'Model': name,
            'Mean Accuracy': mean_accuracy,
            'Mean Hamming Loss': mean_hamming_loss,
            'Mean Precision': mean_precision,
            'Mean Recall': mean_recall,
            'Mean F1-score': mean_f1_score
        })

        # Update the best model if current model is better
        if mean_accuracy > best_accuracy:
            best_accuracy = mean_accuracy
            best_model = pipeline

    # Save the best model for the current label
    joblib.dump(best_model, f'best_model_{x}.pkl')

    # Convert results to DataFrame and sort by Mean Accuracy
    model_results_df = pd.DataFrame(model_results)
    model_results_df = model_results_df.sort_values(by='Mean Accuracy', ascending=False)

    # Keep top 3 models
    top_3_models = model_results_df.head(3)

    # Print top 3 models and their metrics
    print(f"Top 3 models for label {x}:")
    print(top_3_models)
    print("*" * 50)

    # Add label information and append to results
    top_3_models['Label'] = x
    results.append(top_3_models)

# Concatenate all results
final_results = pd.concat(results, ignore_index=True)

In [None]:
final_results

In [None]:
view = pd.merge(final_results.groupby('Label').max('Mean Accuracy'), final_results[['Label', 'Model', 'Mean Accuracy']], on=['Label', 'Mean Accuracy'], how='inner')
view

---


In [None]:
df_mplans['datasilver'].columns