# imports

In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, roc_curve, auc, plot_roc_curve, classification_report
from math import exp
from IPython.display import display_html, display,HTML
import random
import shap


Load the data from a web URL

In [61]:
# Raw CSV of the Big Five survey scores, hosted on GitHub
csv_url = 'https://raw.githubusercontent.com/efratkohen/Big_five/master/big_five_scores.csv'
# The first CSV column is used as the row index
df=pd.read_csv(csv_url, index_col=0) 

In [62]:
#Add show/hide option for the jupyter notebook. press the show/hide button after running this cell.
def hide_toggle(text='Toggle', for_next=False):
    """
    Build an HTML snippet with a link that shows/hides the input area of a code cell.

    Parameters
    ---------
    text: str
        Prefix of the link caption (' show/hide' is appended to it).
    for_next: bool
        False - the link toggles the input of the selected cell itself.
        True  - the link toggles the input of the following cell, and the
                input of the current cell is hidden immediately.

    return

    IPython.display.HTML holding the <script> and the <a> link.
    """
    # selector of the currently selected rendered code cell (the `$` function must exist in the page)
    current = """$('div.cell.code_cell.rendered.selected')"""

    if for_next:
        selector = current + '.next()'
        label = text + ' show/hide' + ' next cell'
        hide_now = current + '.find("div.input").hide();'
    else:
        selector = current
        label = text + ' show/hide'
        hide_now = ''

    # random suffix so that every generated JS function gets its own name
    fn = 'code_toggle_{}'.format(str(random.randint(1,2**64)))

    template = """
        <script>
            function {f_name}() {{
                {cell_selector}.find('div.input').toggle();
            }}

            {js_hide_current}
        </script>

        <a href="javascript:{f_name}()">{toggle_text}</a>
    """
    snippet = template.format(
        f_name=fn,
        cell_selector=selector,
        js_hide_current=hide_now,
        toggle_text=label
    )
    return HTML(snippet)
hide_toggle()

# Exploratory Data Analysis

In [63]:
# Column dtypes, non-null counts and memory usage of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 307313 entries, 1 to 334161
Data columns (total 8 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   country                  307141 non-null  object 
 1   age                      307313 non-null  int64  
 2   sex                      307313 non-null  int64  
 3   agreeable_score          307313 non-null  float64
 4   extraversion_score       307313 non-null  float64
 5   openness_score           307313 non-null  float64
 6   conscientiousness_score  307313 non-null  float64
 7   neuroticism_score        307313 non-null  float64
dtypes: float64(5), int64(2), object(1)
memory usage: 21.1+ MB


Check for NAN values

In [64]:
# dropna() removes every row holding at least one NaN, so the difference in length is the number of such rows
NAN_rows = len(df) - len(df.dropna())
NAN_percent = NAN_rows / len(df)
print(f"The amount of rows containig nan values is {(NAN_percent * 100):.2f}% ")

The amount of rows containig nan values is 0.06% 


Drop all rows containing NaN values, after verifying that they make up less than 0.1% of the data

In [65]:
# Working copy without any row that has a missing value; `df` itself is left untouched
data = df.dropna()

In [66]:
# Summary statistics of the numeric columns after the NaN rows were dropped
data.describe()

Unnamed: 0,age,sex,agreeable_score,extraversion_score,openness_score,conscientiousness_score,neuroticism_score
count,307141.0,307141.0,307141.0,307141.0,307141.0,307141.0,307141.0
mean,25.19,1.6,0.7,0.67,0.73,0.7,0.57
std,10.0,0.49,0.09,0.11,0.09,0.11,0.13
min,10.0,1.0,0.2,0.2,0.25,0.21,0.2
25%,18.0,1.0,0.64,0.6,0.67,0.63,0.49
50%,22.0,2.0,0.7,0.68,0.74,0.71,0.57
75%,29.0,2.0,0.76,0.75,0.8,0.78,0.66
max,99.0,2.0,1.0,0.99,1.0,1.0,1.0


Check data balance between female and male

In [67]:
# Relative frequency of each sex code. The shares are looked up by code (1 = male, 2 = female)
# rather than by position: value_counts() sorts by frequency, so positional access would only
# label the classes correctly as long as females stay the majority.
gender_count = data['sex'].value_counts(normalize=True)
print(f"The data containes {(gender_count.loc[2]*100):.2f}% female's surveys and {(gender_count.loc[1]*100):.2f}% male's surveys")

The data containes 60.26% female's surveys and 39.74% male's surveys


Pairplot visualization with a female/male color map:

In [None]:
# Pairplot of `data`, colored by the 'sex' code
g = sns.pairplot(data, hue='sex')
# Replace the legend texts with readable labels (in this data 1 is male and 2 is female;
# assumes the legend lists the codes in ascending order - TODO confirm)
new_labels = ['Male', 'Female']
for t, l in zip(g._legend.texts, new_labels): t.set_text(l)
_=g

Box plot visualization for the five personality traits by sex:

In [None]:
personality_traits = ['agreeable_score', 'extraversion_score', 'openness_score', 'conscientiousness_score','neuroticism_score']
# Long format: one row per (respondent, trait), so the boxes can be grouped by trait
data1 = pd.melt(data, id_vars=['sex'], value_vars=personality_traits)
# Map the numeric codes to labels in one step; writing strings into the integer column
# through .loc relies on an implicit dtype upcast
data1['sex'] = data1['sex'].map({1: 'Male', 2: 'Female'})
fig, ax = plt.subplots(figsize = (15, 10))
m = sns.boxplot(ax=ax, data=data1, x="variable", y="value", hue="sex")

Box plot visualization for age by sex:

In [None]:
# Long format with 'age' as the only variable, so the same boxplot call as for the traits can be used
data_age_melt = pd.melt(data, id_vars=['sex'], value_vars=['age'])
# Map the numeric codes to labels in one step; writing strings into the integer column
# through .loc relies on an implicit dtype upcast
data_age_melt['sex'] = data_age_melt['sex'].map({1: 'Male', 2: 'Female'})
fig, ax = plt.subplots(figsize = (15, 10))
m = sns.boxplot(ax=ax, data=data_age_melt, x="variable", y="value", hue="sex")

Calculation of the number of countries in the data:


In [None]:
# `data` holds no missing values at this point, so nunique() gives the same number as len(unique())
print(f"The data containes {data['country'].nunique()} different countries")

How many observations are there for each country in the data?

In [None]:
# Observations per country, smallest first ('age' has no missing values in `data`,
# so counting it gives the number of rows of each country)
country = data.groupby(['country'])['age'].count().sort_values(ascending=True)
# Horizontal bars, one per country
_=country.plot(kind='barh', figsize=(20,50))
_=plt.xlabel("Number of observations")

### Conclusions from the data exploration:

1. Gender: The data is not balanced; there are more female observations than male ones.
2. Age: Most of the surveys were filled in by young people in their twenties, which is not representative of the world's population.
3. Countries: The USA accounts for the majority of the observations (69%), and only 11 other countries have more than 0.5% of the observations each.
4. Personality traits: The score distributions of females are all slightly higher than those of males, especially for neuroticism and agreeableness.

# Preprocessing

According to the exploratory data analysis we decided to:
1. Balance the data by gender.
2. Use one-hot encoding to give a dedicated feature only to countries with a significant number of surveys (more than 0.5%); all the other countries are gathered into a single feature, 'Other'.
3. Normalize the data.
4. Add new features: the product of every pair of personality scores.
5. We tried to use PCA for dimensionality reduction, due to the similarity of the personality traits, but decided not to use it as the results did not show a beneficial output.
6. We tried to use K-means in order to see if there is a 'cultural' diversity (by country regions), but decided not to use it as the results did not show a beneficial output.

We decided to run the models on three different datasets in order to understand the influence of the preprocessing on the results:
1. raw_data: the original data without NaN values, with one-hot encoding of the significant countries and normalization.
2. balanced_data: raw_data with the female observations downsampled to match the population's gender distribution.
3. balanced_interacted_data: balanced_data with new features holding the product of every pair of personality scores.

### raw data

Keep only the countries that account for more than 0.5% of the data.


In [None]:
data_lenght = len(data)
# A country is treated as significant when it holds more than 0.5% of all observations
significant_percent = 0.005
significant_observations = significant_percent * data_lenght
# Show the countries above the threshold together with their observation counts
country[country > significant_observations]

In [None]:
# Names of the countries whose observation count passes the significance threshold
country_list = list(country[country > significant_observations].index)

Build a dataframe of binary indicator variables for the chosen countries and join it with the `data` dataframe

In [None]:
# One-hot encode only the significant countries; all remaining countries share a single 'Other' flag.
is_significant_country = data['country'].isin(country_list)
# dtype=float keeps the indicator columns numeric after the concat introduces NaN for the rows
# that have no dummy row (the rows of the non-significant countries)
countries = pd.get_dummies(data.loc[is_significant_country, 'country'], dtype=float)
data_country = pd.concat([data, countries] ,axis=1)
# 'Other' is derived directly from the country membership and appended as the last column, so it
# depends neither on a fixed column position nor on one specific country column being present
data_country['Other'] = (~is_significant_country).astype(int) #add 'other' country column
# the NaN left in the indicator columns mean "not this country" -> 0
raw_data = data_country.fillna(0).drop(['country'], axis=1)

In [None]:
# Quick look at the encoded frame: original numeric columns plus the country indicator columns
raw_data.head()

Normalize features:

In [None]:
# Min-max scaling of every column of raw_data, including the 'sex' target
# (its codes 1/2 become 0/1, which is what the later cells filter on: 0 = male, 1 = female).
# NOTE(review): the scaler is fitted on the full data, before the train/test split - confirm this is intended.
scaler = MinMaxScaler()
names = raw_data.columns
d = scaler.fit_transform(raw_data)
# fit_transform returns a plain array, so index and column labels are restored here
scaled_raw_data = pd.DataFrame(d, index=raw_data.index, columns=names)

### Splitting the data into train and test sets

In [None]:
target_name = 'sex'
# Fixed seed so that the split, and every result derived from it, is reproducible across runs;
# the train/test proportions are left at the library default.
scaled_raw_train, scaled_raw_test = train_test_split(scaled_raw_data, random_state=42)

In [None]:
# Size and column dtypes of the train part of the split
scaled_raw_train.info()

In [None]:
# Size and column dtypes of the test part of the split
scaled_raw_test.info()

### balanced_data

The world's population is 50.5% male and 49.5% female (https://countrymeters.info/en/World). The original data contains 60.26% female surveys and 39.74% male surveys. Therefore we undersample the female observations in our train data.

In [None]:
# Class sizes in the train set, computed once (after scaling: 0 = male, 1 = female)
sex_counts = scaled_raw_train['sex'].value_counts()
# number of male observations - used as the sample size when undersampling the female class
minority_class = sex_counts[0]
print(f"The scaled_raw_train data containes {(sex_counts[1])} female's surveys and {(sex_counts[0])} male's surveys")

In [None]:
# Shuffle the Dataset.
shuffled_df = scaled_raw_train.sample(frac=1,random_state=4)

# Put all the male class in a separate dataset.
male_df = shuffled_df.loc[shuffled_df['sex'] == 0] #0= male, 1= female

#Randomly select minority_class number observations from the female (majority class)
female_df = shuffled_df.loc[shuffled_df['sex'] == 1].sample(n=minority_class,random_state=42)

# Concatenate both dataframes again
balanced_train = pd.concat([male_df, female_df])

#plot the dataset after undersampling
plt.figure(figsize=(8, 8))
# the column is passed by keyword: newer seaborn releases accept `data` as the only positional argument
sns.countplot(x='sex', data=balanced_train)
plt.title('Balanced Classes')
plt.show()

### balanced_interacted_data

Add new features of multiplication of any two personality scores

In [None]:
# The five personality score columns; they are combined pairwise into the interaction features
Personality_Traits_list = ['agreeable_score', 'extraversion_score', 'openness_score', 'conscientiousness_score', 'neuroticism_score']

In [None]:
#Run on Personality_Traits and multiply scores of any two pairs
def feature_interactions(data: pd.DataFrame, Personality_Traits_list: list):
    """
    Add one product column for every pair of personality traits and min-max normalize the result.

    Parameters
    ---------
    data: pd.DataFrame
        Must contain every column named in Personality_Traits_list. It is not modified.
    Personality_Traits_list: list
        Features's list of Personality_Traits

    return

    interacted_data :pd.DataFrame
        Same index as data. Holds the columns of data followed by one 'a*b' column per
        pair of traits; every column is rescaled by a MinMaxScaler fitted on this frame.
    """
    interacted_data = data.copy()
    # every unordered pair once, in list order: (0,1), (0,2), ..., (n-2,n-1)
    for first, second in itertools.combinations(Personality_Traits_list, 2):
        interacted_data[first + '*' + second] = interacted_data[first] * interacted_data[second]

    #normalize the data:
    # - a local scaler is used, so the notebook-level `scaler` stays fitted to what it was fitted on
    # - the column labels are taken from the widened frame; labels of a frame without the
    #   interaction columns do not match the shape of the scaled array
    d = MinMaxScaler().fit_transform(interacted_data)
    interacted_data = pd.DataFrame(d, index=interacted_data.index, columns=interacted_data.columns)

    return interacted_data

In [None]:
# Add the interaction columns to the train and to the test frame.
# NOTE(review): each call runs its own fit_transform inside feature_interactions, so train and test
# are normalized independently of each other - confirm this is intended.
balanced_interacted_train = feature_interactions(balanced_train, Personality_Traits_list)
interacted_test = feature_interactions(scaled_raw_test, Personality_Traits_list) #the test data was not balanced, we only add the interaction columns

In [None]:
# Preview only the first rows instead of rendering the whole frame
balanced_interacted_train.head()

In [None]:
# Preview only the first rows instead of rendering the whole frame
interacted_test.head()

In the preprocessing we also tried
K-means, outlier handling and PCA,
but eventually decided not to use them, because they did not help the prediction.

In [None]:
#PCA
def pca_plot(data: pd.DataFrame, features: list, color_col: str ="sex"):
    """
    Scatter plot of the first two principal components of the given features.

    The features are standardized (StandardScaler) before a 2-component PCA is fitted
    on them. The plot is drawn with seaborn on the current axes, its legend is removed
    and a title naming the features and the color column is set. Nothing is returned.

    Parameters
    ---------
    data: pd.DataFrame
        Frame holding both the feature columns and the color column.
    features: list
        Features's list that we would like to reduce dimension
    color_col: str
        name of column to color the dots by.
    """
    # the features are taken from the frame that was passed in, so that the projected
    # rows and the colors below always come from the same data
    x_only = data[features]

    pca_model = make_pipeline(StandardScaler(), PCA(n_components=2))
    pca_model.fit(x_only)

    X_2D = pca_model.transform(x_only)
    pca_results = pd.DataFrame(dict(PCA1=X_2D[:, 0], PCA2=X_2D[:, 1]))

    # positional assignment: row k of the projection gets the color value of row k of data
    pca_results["color"] = data[color_col].to_numpy()

    g = sns.scatterplot(data=pca_results, x="PCA1", y="PCA2", hue="color")
    g.legend_.remove()
    g.set(title=f"PCA of {features} colored by {color_col}")
_=pca_plot(scaled_raw_data, Personality_Traits_list)
hide_toggle('PCA')

In [None]:
#Kmeans
# fixed seed so that the cluster assignment is the same on every run
Kmeans = KMeans(n_clusters=3, random_state=0)
# fit_predict fits the model and returns the labels, so a separate fit() call is not needed
y_km = Kmeans.fit_predict(scaled_raw_data[Personality_Traits_list])
new_series = pd.Series(y_km, index=scaled_raw_data.index, name='cluster')
scaled_raw_data_Kmeans = pd.concat([scaled_raw_data, new_series] ,axis=1)
#Evaluation of the cluster result 
# only the last expression of a cell is rendered automatically, so each table is displayed explicitly
cluster_groups = scaled_raw_data_Kmeans.groupby(by=["cluster", "sex"])
for stat in ("median", "count", "mean", "sum"):
    print(stat)
    display(cluster_groups.agg(stat))
hide_toggle('Kmeans')

List of the three trained data:

In [None]:
# The three train sets the models are fitted on, in the order: scaled raw, balanced, balanced + interactions
train_datasets = [scaled_raw_train, balanced_train, balanced_interacted_train]

# Models

We tried a few models (Logistic regression, Linear regression, Decision trees, Gradient Boosting).
The best results were given by Logistic regression and Gradient Boosting; we show them here:

### Logistic regression

In [None]:
target_name = 'sex'

# Accumulators filled by the logistic regression loop, one entry per train dataset
models_list = []
X_test_list = []
y_test_list = []
y_predicted_list = []
X_train_list = []
# display names, in the same order as train_datasets
datasets_names = ['scaled_raw', 'balanced', 'balanced_interacted']

Run the Logistic regression model on the three datasets:

In [None]:
# Start from empty accumulators, so that re-running this cell does not append a second set of
# results (the cells below index these lists by dataset position)
for results in (models_list, X_train_list, X_test_list, y_test_list, y_predicted_list):
    results.clear()

for train_data in train_datasets:

    #split train and test to X and y:
    X_train = train_data.drop(columns=[target_name])
    y_train = train_data[target_name].copy()
    X_train_list.append(X_train)
    #fit the model according to train data
    Lreg = LogisticRegression(max_iter=100000).fit(X_train, y_train)
    #save model to model list
    models_list.append(Lreg)

    #different kind of test data because of the added columns:
    #the test frame is the one whose columns match the train frame
    if train_data.columns.equals(interacted_test.columns):
        test_data = interacted_test
    else:
        #first two options
        test_data = scaled_raw_test

    X_test = test_data.drop(columns=[target_name])
    y_test = test_data[target_name].copy()
    X_test_list.append(X_test)
    y_test_list.append(y_test)

    #predict
    y_predicted = Lreg.predict(X_test)
    y_predicted_list.append(y_predicted)
    

Plot the classification report, ROC curve and confusion matrix for the three options:

In [None]:
target_names = ['Male', 'Female']

# Creat plots for comparison
fig_roc, ax_roc = plt.subplots(nrows=1, ncols=3, figsize=(30,13 ))
fig_cm, ax_cm = plt.subplots(nrows=1, ncols=3, figsize=(25, 10))

report_list = []
reports_html = []

# one pass per dataset: ROC curve, confusion matrix and classification report
for i in range(len(train_datasets)):
    #roc curve
    _ = plot_roc_curve(models_list[i], X_test_list[i], y_test_list[i],
                       ax=ax_roc[i])
    ax_roc[i].set_title(datasets_names[i])

    #confusion matrix
    disp = plot_confusion_matrix(models_list[i],
                                 X_test_list[i],
                                 y_test_list[i],
                                 display_labels=target_names,
                                 cmap=plt.cm.Blues,
                                 normalize="true", ax = ax_cm[i], colorbar=True)
    ax_cm[i].set_title(datasets_names[i])

    #classification report
    report = classification_report(y_test_list[i], y_predicted_list[i], target_names=target_names, output_dict=True, digits=2)
    report_df = pd.DataFrame(report).transpose()
    report_list.append(report_df)

    # inline style so that the report tables are rendered next to each other
    styler = (report_df.style
              .set_table_attributes("style='display:inline'")
              .set_precision(2)
              .set_properties(**{'font-size': '8pt'})
              .set_caption(datasets_names[i]))
    reports_html.append(styler._repr_html_())

display_html(''.join(reports_html), raw=True)


hide_toggle('Plots')
    
    

### Gradient Boosting

Run the Gradient Boosting model on the three datasets:

In [None]:
# Accumulators filled by the gradient boosting loop, one entry per train dataset
models_list_GB = []
y_predicted_list_GB = []

In [None]:
# Start from empty accumulators, so that re-running this cell does not append a second set of results
models_list_GB.clear()
y_predicted_list_GB.clear()

for train_data in train_datasets:

    #split train and test to X and y:
    X_train = train_data.drop(columns=[target_name])
    y_train = train_data[target_name].copy()
    #fit the model according to train data
    gradient_boosting  = GradientBoostingClassifier(random_state =0).fit(X_train, y_train)
    #save model to model list
    models_list_GB.append(gradient_boosting)

    #different kind of test data because of the added columns:
    #the test frame is the one whose columns match the train frame
    if train_data.columns.equals(interacted_test.columns):
        test_data = interacted_test
    else:
        #first two options
        test_data = scaled_raw_test

    X_test = test_data.drop(columns=[target_name])
    y_test = test_data[target_name].copy()

    #predict
    y_predicted = gradient_boosting.predict(X_test)
    y_predicted_list_GB.append(y_predicted)

Plot results of Report, ROC curve and confusion_matrix for the three options:

In [None]:
# Evaluate the gradient boosting models: the models and predictions come from the *_GB lists,
# the test sets are the ones stored in X_test_list / y_test_list.
for i in range(len(models_list_GB)):
    print("Plots for ", datasets_names[i])
    _ = plot_roc_curve(models_list_GB[i], X_test_list[i], y_test_list[i])
    report = classification_report(y_test_list[i], y_predicted_list_GB[i], target_names=target_names)
    print(report)
    disp = plot_confusion_matrix(models_list_GB[i],
                                 X_test_list[i],
                                 y_test_list[i],
                                 display_labels=['Male', 'Female'],
                                 cmap=plt.cm.Blues,
                                 normalize="true")
    plt.show()

We can see that balancing the data had the most significant influence on the results - it improved the male prediction.
If the female recall is more important than the male recall, the raw data is preferable.
The interaction features did not improve the prediction.

# Explainability

### Logistic Regression model - explainability

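Calculate the Logistic Regression explainability with the odds ratio equation (the change in the odds when feature $x_j$ grows by $\delta$; for $\delta = 1$ this is $e^{\beta_j}$):
$\frac{Odds(x_j + \delta)}{Odds(x_j)} = e^{\beta_j \delta}$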

In [None]:
def explainability_Logistic_Regression (coef: np.array, delta: float):
    """
    Odds ratio of every feature for an increase of `delta` in that feature.

    Parameters
    ---------
    coef: np.array
        2-D coefficient array; only its first row is used, one coefficient per feature.
    delta: float
        Size of the change in the feature value.

    return

    list of float: exp(coefficient * delta) for every feature, in the order of the coefficients.
    """
    # the coefficients are read from the argument, so every model is explained by its own weights
    return [exp(beta * delta) for beta in coef[0]]

Plot explainability bar graph for each data option:

In [None]:
# One horizontal bar chart per fitted logistic regression model
for i in range(len(models_list)):
    coef = models_list[i].coef_
    #delta is positive - probability for femal
    Explainability = explainability_Logistic_Regression(coef, 0.3)
    Explainability_df = pd.Series(
        np.array(Explainability),
        index=list(X_test_list[i].columns),
        name='Explainability',
    )
    # the 20 largest values, sorted ascending so that the largest bar is drawn at the top
    top_values = Explainability_df.nlargest(20).sort_values()
    _ = top_values.plot(kind='barh')
    plt.title(datasets_names[i])
    plt.xlabel("Feature importance")
    plt.show()

### Gradient Boosting model - Shap explainability

Calculate and plot the Gradient Boosting explainability with SHAP for each data option:

In [None]:
# SHAP explanation of every gradient boosting model on the train features it was fitted on.
# The loop is bounded by the list that is actually indexed (models_list_GB).
for i in range(len(models_list_GB)):
    print(f"                                    {datasets_names[i]}")
    explainer = shap.Explainer(models_list_GB[i])
    shap_values = explainer(X_train_list[i])
    shap.plots.bar(shap_values)
    shap.plots.beeswarm(shap_values)
    plt.show()