In [244]:
# dataset manipulation
import pandas as pd
# numerical operations
import numpy as np
# graphing applications
import seaborn as sns

[Dataset](https://www.kaggle.com/datasets/utkarshx27/smoking-dataset-from-uk) (link to dataset)

This dataset consists of smoking data gathered from the uk, and contains lots of demographic data along with the type of tobacco consumed, this is my first foray into categorical variables in quite some time, so this will be a good welcoming back. I am planning on selecting my target variable as the `smoke` column which contains a **1** if they are currently a smoker and a **0** if they are not. I will use all the features left after removing that column as my feature space. I may involve feature engineering in the analysis which will help me decide which features to keep and which ones to remove. This is my second health dataset in a row, and I am interested in the trends, as I have a smoker in my family.

In [245]:
## loading in dataset
smoking_df = pd.read_csv('smoking.csv', index_col=0)

smoking_df.head()  # Looks like we have some missing values as well, interesting!

Unnamed: 0,gender,age,marital_status,highest_qualification,nationality,ethnicity,gross_income,region,smoke,amt_weekends,amt_weekdays,type
1,Male,38,Divorced,No Qualification,British,White,"2,600 to 5,200",The North,No,,,
2,Female,42,Single,No Qualification,British,White,"Under 2,600",The North,Yes,12.0,12.0,Packets
3,Male,40,Married,Degree,English,White,"28,600 to 36,400",The North,No,,,
4,Female,40,Married,Degree,English,White,"10,400 to 15,600",The North,No,,,
5,Female,39,Married,GCSE/O Level,British,White,"2,600 to 5,200",The North,No,,,


In [246]:
smoking_df.dtypes

gender                    object
age                        int64
marital_status            object
highest_qualification     object
nationality               object
ethnicity                 object
gross_income              object
region                    object
smoke                     object
amt_weekends             float64
amt_weekdays             float64
type                      object
dtype: object

Ase you can see, we have quite a lot of categorical variables to handle (most of the dataset), so first let's get to encoding them, but before we do that, let's analyze and see if any columns contain NaN values. That is our first worry.

In [247]:
contains_nan = {}

for each_column in smoking_df.columns:
    contains_nan[each_column] = smoking_df[each_column].hasnans

contains_nan

{'gender': False,
 'age': False,
 'marital_status': False,
 'highest_qualification': False,
 'nationality': False,
 'ethnicity': False,
 'gross_income': False,
 'region': False,
 'smoke': False,
 'amt_weekends': True,
 'amt_weekdays': True,
 'type': True}

So, it looks like we have about 3 columns that contain NaN values, that is the first part of the dataset we want to fix, after that, we can begin tackling the categorical variables!

In [248]:
# We have one small issue though, one of the NaN columns contains categorical data, not numerical data, so we may have to use frequency to fill those values.

from sklearn.impute import SimpleImputer

type_imputer = SimpleImputer(strategy='most_frequent')

type_imputer.fit(smoking_df[['type']])

values = type_imputer.transform(smoking_df[['type']])

type_imputed_smoking_df = smoking_df.copy()

type_imputed_smoking_df['type'] = values

type_imputed_smoking_df[
    'type'].hasnans  # Yay! The missing values have been removed, now we can move onto the other 2 numerical columns

False

In [249]:
# Do the same thing for amt_weekdays, except use the `mean` strategy to impute the missing values

amt_weekdays_imputer = SimpleImputer(strategy='mean')

amt_weekdays_imputer.fit(smoking_df[['amt_weekdays']])

values = amt_weekdays_imputer.transform(smoking_df[['amt_weekdays']])

amt_weekdays_imputed_smoking_df = type_imputed_smoking_df.copy()

amt_weekdays_imputed_smoking_df['amt_weekdays'] = values

amt_weekdays_imputed_smoking_df['amt_weekdays'].hasnans

False

In [250]:
# Do the same thing for the amt_weekends, except use the `mean` strategy to impute the missing values

amt_weekends_imputer = SimpleImputer(strategy='mean')

amt_weekends_imputer.fit(smoking_df[['amt_weekends']])

values = amt_weekends_imputer.transform(smoking_df[['amt_weekends']])

amt_weekends_imputed_smoking_df = amt_weekdays_imputed_smoking_df.copy()

amt_weekends_imputed_smoking_df['amt_weekends'] = values

amt_weekends_imputed_smoking_df['amt_weekends'].hasnans

False

In [251]:
imputed_smoking_df = amt_weekends_imputed_smoking_df.copy()

contains_nan = {}

for each_column in imputed_smoking_df.columns:
    contains_nan[each_column] = imputed_smoking_df[each_column].hasnans

contains_nan

# We've successfully removed all NaN values!

{'gender': False,
 'age': False,
 'marital_status': False,
 'highest_qualification': False,
 'nationality': False,
 'ethnicity': False,
 'gross_income': False,
 'region': False,
 'smoke': False,
 'amt_weekends': False,
 'amt_weekdays': False,
 'type': False}

In [252]:
# Now, lets checkout if we have any odd standard deviations across our dataset


imputed_smoking_df.head(), imputed_smoking_df.dtypes, imputed_smoking_df.describe()  # Doesn't look too bad at all, no insanely high standard deviations

(   gender  age marital_status highest_qualification nationality ethnicity  \
 1    Male   38       Divorced      No Qualification     British     White   
 2  Female   42         Single      No Qualification     British     White   
 3    Male   40        Married                Degree     English     White   
 4  Female   40        Married                Degree     English     White   
 5  Female   39        Married          GCSE/O Level     British     White   
 
        gross_income     region smoke  amt_weekends  amt_weekdays     type  
 1    2,600 to 5,200  The North    No     16.410926     13.750594  Packets  
 2       Under 2,600  The North   Yes     12.000000     12.000000  Packets  
 3  28,600 to 36,400  The North    No     16.410926     13.750594  Packets  
 4  10,400 to 15,600  The North    No     16.410926     13.750594  Packets  
 5    2,600 to 5,200  The North    No     16.410926     13.750594  Packets  ,
 gender                    object
 age                        int64

Let's look at how we can process the `gross_income` column, we can find a way to convert that to numerical, data, let's see what values that column contains, and come up with a method to parse it.

In [253]:
imputed_smoking_df['gross_income'].unique()

array(['2,600 to 5,200', 'Under 2,600', '28,600 to 36,400',
       '10,400 to 15,600', '15,600 to 20,800', 'Above 36,400',
       '5,200 to 10,400', 'Refused', '20,800 to 28,600', 'Unknown'],
      dtype=object)

In [254]:
from sklearn.preprocessing import StandardScaler

gross_income_df = imputed_smoking_df.copy()


def process_gross_income(income_amount: str) -> float:
    income_amount = income_amount.replace(',', '')
    if 'to' in income_amount:
        [start, end] = income_amount.split(' to ')
        return (int(end) + int(start)) / 2
    elif 'Above' in income_amount:
        above_amount = int(income_amount.split('Above ')[1])
        return above_amount + (above_amount / 2)
    elif 'Under' in income_amount:
        under_amount = int(income_amount.split('Under ')[1])
        return under_amount / 2
    else:
        return 0.0


def further_process(income_amount: float, mean: float) -> float:
    if income_amount == 0:
        return mean
    return income_amount


gross_income_df['gross_income'] = gross_income_df['gross_income'].map(lambda x: process_gross_income(x))

gross_income_df['gross_income'] = gross_income_df['gross_income'].map(
    lambda x: further_process(x, float(np.mean(gross_income_df['gross_income']))))

scaler = StandardScaler()

gross_income_df['gross_income'] = scaler.fit_transform(gross_income_df[['gross_income']])

imputed_smoking_df['gross_income'] = gross_income_df['gross_income']

gross_income_df['gross_income'].describe()

count    1.691000e+03
mean     1.589914e-16
std      1.000296e+00
min     -1.048523e+00
25%     -5.222177e-01
50%     -1.011735e-01
75%      3.198707e-01
max      3.267180e+00
Name: gross_income, dtype: float64

We've now processed the gross income column, and successfully converted it to numerical data instead of categorical data. Now we must decide which column we want to preprocess next, let's observe the columns available to us right now.

In [255]:
gross_income_df.dtypes

gender                    object
age                        int64
marital_status            object
highest_qualification     object
nationality               object
ethnicity                 object
gross_income             float64
region                    object
smoke                     object
amt_weekends             float64
amt_weekdays             float64
type                      object
dtype: object

We can try to tackle the region column now, that is strictly categorical data, there is no way we can interpret it as numerical (in the case of gross income, we can interpret above 59000 as some number, but for example with region we can interpret region x as 0, y as 1, z as 2, and so on.

In [256]:
from sklearn.preprocessing import OneHotEncoder

# One hot encoding the categorical values of the region column, and creating a new dataframe from it
dummies = pd.get_dummies(imputed_smoking_df, columns=['region'], drop_first=True)

# copy the dataframe, which will have the `region` column removed and the one-hot encoded ones appended
region_imputed_smoking_df = dummies.copy()

region_imputed_smoking_df.head()

Unnamed: 0,gender,age,marital_status,highest_qualification,nationality,ethnicity,gross_income,smoke,amt_weekends,amt_weekdays,type,region_Midlands & East Anglia,region_Scotland,region_South East,region_South West,region_The North,region_Wales
1,Male,38,Divorced,No Qualification,British,White,-0.838001,No,16.410926,13.750594,Packets,0,0,0,0,1,0
2,Female,42,Single,No Qualification,British,White,-1.048523,Yes,12.0,12.0,Packets,0,0,0,0,1,0
3,Male,40,Married,Degree,English,White,1.477742,No,16.410926,13.750594,Packets,0,0,0,0,1,0
4,Female,40,Married,Degree,English,White,-0.101173,No,16.410926,13.750594,Packets,0,0,0,0,1,0
5,Female,39,Married,GCSE/O Level,British,White,-0.838001,No,16.410926,13.750594,Packets,0,0,0,0,1,0


Alright! So far we have the region column one-hot encoded, resulting in a numerical representation of a categorical column, which allows our model to process it properly. We have a couple more categorical columns to process, like gender, marital_status, qualification, ethnicity, type, and nationality.

In [257]:
# One-hot encoding all the remaining categorical columns, and placing it in the `gender_dummies` dataframe, now we create a new dataframe to process our data!
gender_dummies = pd.get_dummies(region_imputed_smoking_df,
                                columns=['gender', 'marital_status', 'highest_qualification', 'nationality',
                                         'ethnicity', 'type'])

gender_dummies['smoke'] = gender_dummies['smoke'].map(lambda x: 1 if x == 'Yes' else 0)

gender_dummies.head()

Unnamed: 0,age,gross_income,smoke,amt_weekends,amt_weekdays,region_Midlands & East Anglia,region_Scotland,region_South East,region_South West,region_The North,...,ethnicity_Black,ethnicity_Chinese,ethnicity_Mixed,ethnicity_Refused,ethnicity_Unknown,ethnicity_White,type_Both/Mainly Hand-Rolled,type_Both/Mainly Packets,type_Hand-Rolled,type_Packets
1,38,-0.838001,0,16.410926,13.750594,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,1
2,42,-1.048523,1,12.0,12.0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,1
3,40,1.477742,0,16.410926,13.750594,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,1
4,40,-0.101173,0,16.410926,13.750594,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,1
5,39,-0.838001,0,16.410926,13.750594,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,1


In [258]:
finalized_df = gender_dummies.copy()

finalized_df.describe()

Unnamed: 0,age,gross_income,smoke,amt_weekends,amt_weekdays,region_Midlands & East Anglia,region_Scotland,region_South East,region_South West,region_The North,...,ethnicity_Black,ethnicity_Chinese,ethnicity_Mixed,ethnicity_Refused,ethnicity_Unknown,ethnicity_White,type_Both/Mainly Hand-Rolled,type_Both/Mainly Packets,type_Hand-Rolled,type_Packets
count,1691.0,1691.0,1691.0,1691.0,1691.0,1691.0,1691.0,1691.0,1691.0,1691.0,...,1691.0,1691.0,1691.0,1691.0,1691.0,1691.0,1691.0,1691.0,1691.0,1691.0
mean,49.836192,1.589914e-16,0.248965,16.410926,13.750594,0.261975,0.087522,0.149024,0.092844,0.251922,...,0.020106,0.015967,0.008279,0.007688,0.001183,0.922531,0.005914,0.024837,0.042578,0.926671
std,18.736851,1.000296,0.432541,4.931838,4.680237,0.439839,0.282682,0.356218,0.2903,0.434245,...,0.140406,0.125384,0.090639,0.087368,0.034381,0.267413,0.076695,0.155675,0.201964,0.260754
min,16.0,-1.048523,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,34.0,-0.5222177,0.0,16.410926,13.750594,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
50%,48.0,-0.1011735,0.0,16.410926,13.750594,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
75%,65.5,0.3198707,0.0,16.410926,13.750594,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
max,97.0,3.26718,1.0,60.0,55.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [259]:
from sklearn.feature_selection import mutual_info_classif

mutual_info = mutual_info_classif(finalized_df.drop(columns=['smoke']), finalized_df['smoke'])

cols = finalized_df.drop(columns=['smoke']).columns

mutual_info_df = pd.DataFrame({cols[0]: [0]})

for ind, each_column in enumerate(cols):
    mutual_info_df[each_column] = mutual_info[ind]

values = {}

for each_value in mutual_info_df:
    col_value = list(mutual_info_df[each_value])[0]
    values[each_value] = col_value

removal_values = []
for each_key in values:
    if values[each_key] < 0.09:
        removal_values.append(each_key)

removal_values

['age',
 'gross_income',
 'region_Midlands & East Anglia',
 'region_Scotland',
 'region_South East',
 'region_South West',
 'region_The North',
 'region_Wales',
 'gender_Female',
 'gender_Male',
 'marital_status_Divorced',
 'marital_status_Married',
 'marital_status_Separated',
 'marital_status_Single',
 'marital_status_Widowed',
 'highest_qualification_A Levels',
 'highest_qualification_Degree',
 'highest_qualification_GCSE/CSE',
 'highest_qualification_GCSE/O Level',
 'highest_qualification_Higher/Sub Degree',
 'highest_qualification_No Qualification',
 'highest_qualification_ONC/BTEC',
 'highest_qualification_Other/Sub Degree',
 'nationality_British',
 'nationality_English',
 'nationality_Irish',
 'nationality_Other',
 'nationality_Refused',
 'nationality_Scottish',
 'nationality_Unknown',
 'nationality_Welsh',
 'ethnicity_Asian',
 'ethnicity_Black',
 'ethnicity_Chinese',
 'ethnicity_Mixed',
 'ethnicity_Refused',
 'ethnicity_Unknown',
 'ethnicity_White',
 'type_Both/Mainly Hand-Roll

In [260]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

scaler = StandardScaler()

finalized_df_clone = finalized_df.copy()

train_x, test_x, train_y, test_y = train_test_split(finalized_df_clone.drop(columns=['smoke']),
                                                    finalized_df_clone['smoke'], test_size=.35)

log_model = LogisticRegression()

log_model.fit(train_x, train_y)

altered_df_clone = finalized_df_clone.copy()

altered_df_clone.drop(columns=removal_values, inplace=True)

altered_log_model = LogisticRegression()

altered_train_x, altered_test_x, altered_train_y, altered_test_y = train_test_split(
    altered_df_clone.drop(columns=['smoke']), altered_df_clone['smoke'])

altered_log_model.fit(altered_train_x, altered_train_y)

altered_log_model.score(altered_test_x, altered_test_y), log_model.score(test_x, test_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(0.8297872340425532, 0.8496621621621622)

In [272]:
## Let's try it with a random forest!

from sklearn.metrics import classification_report

random_forest_model = RandomForestClassifier()

random_forest_model.fit(train_x, train_y)

predictions = random_forest_model.predict(test_x)

# Using a random forest classifier, we have reached a perfect model score, something is fishy though, perfect model scores may be a result of underfitting or overfitting.
random_forest_model.score(test_x,
                          test_y), pd.DataFrame(classification_report(test_y, predictions, output_dict=True))

(1.0,
                0      1  accuracy  macro avg  weighted avg
 precision    1.0    1.0       1.0        1.0           1.0
 recall       1.0    1.0       1.0        1.0           1.0
 f1-score     1.0    1.0       1.0        1.0           1.0
 support    460.0  132.0       1.0      592.0         592.0)

Wow! So we managed to score an insanely high score using our RandomForestClassification model, we managed to achieve a perfect score across the board. This was due to one-hot encoding all the categorical variables, and standardizing the continuous variables in the model, to achieve the most efficient dataset for our model to process. Overall, this was an interesting and enlightening foray into handling categorical variables, missing values, and different type of model performances in a dataset consisting of 90% categorical variables. I imputed the values properly, and cleaned up this dataset just right for the random forest to excell in predicting the test dataset properly. Going to pickle-ize this model.

In [275]:
import pickle

filename = 'smoking_rfc.pkl'
pickle.dump(random_forest_model, open(filename, 'wb'))