In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', 500)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import _pickle as pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.metrics import calinski_harabasz_score, silhouette_score, davies_bouldin_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm.notebook import tqdm
import os

In [2]:
path = os.getcwd() + "/Pickles/refined_profiles.pkl"
print(path)
with open(path,'rb') as fp:
    df = pickle.load(fp)

# Viewing the DF    
df

/Users/dianaomomehin/Desktop/AI Matchmaker/Pickles/refined_profiles.pkl


Unnamed: 0,Profiles,Style,Age,Gender
0,"Feminine, Boho, Preppy, Androgynous","[Edgy, Masculine]",18,Female
1,"Boho, Classic, Maximalist, Masculine","[Boho, Romantic]",20,Non-Binary
2,"Masculine, Glam, Romantic, Sexy","[Streetwear, Feminine, Masculine]",22,Female
3,"Androgynous, Glam, Streetwear, Dramatic","[Casual, Romantic, Editorial]",21,Male
4,"Androgynous, Glam, Dramatic, Romantic","[Preppy, Masculine]",29,Non-Binary
...,...,...,...,...
73435,"Maximalist, Elegant, Boho, Dramatic","[Classic, Editorial]",23,Male
73436,"Elegant, Editorial, Androgynous, Preppy","[Preppy, Feminine, Elegant]",20,Female
73437,"Editorial, Casual, Edgy, Boho","[Androgynous, Feminine, Minimalist]",23,Male
73438,"Streetwear, Dramatic, Classic, Masculine","[Boho, Maximalist, Androgynous]",20,Male


## Clustering the Data

In [3]:
def string_convert(x):
    """
    First converts the lists in the DF into strings
    """
    if isinstance(x, list):
        return ' '.join(x)
    else:
        return x
    
# Looping through the columns and applying the function
for col in df.columns:
    df[col] = df[col].apply(string_convert)

In [4]:
df = df[['Profiles', 'Style','Gender','Age']]
df

Unnamed: 0,Profiles,Style,Gender,Age
0,"Feminine, Boho, Preppy, Androgynous",Edgy Masculine,Female,18
1,"Boho, Classic, Maximalist, Masculine",Boho Romantic,Non-Binary,20
2,"Masculine, Glam, Romantic, Sexy",Streetwear Feminine Masculine,Female,22
3,"Androgynous, Glam, Streetwear, Dramatic",Casual Romantic Editorial,Male,21
4,"Androgynous, Glam, Dramatic, Romantic",Preppy Masculine,Non-Binary,29
...,...,...,...,...
73435,"Maximalist, Elegant, Boho, Dramatic",Classic Editorial,Male,23
73436,"Elegant, Editorial, Androgynous, Preppy",Preppy Feminine Elegant,Female,20
73437,"Editorial, Casual, Edgy, Boho",Androgynous Feminine Minimalist,Male,23
73438,"Streetwear, Dramatic, Classic, Masculine",Boho Maximalist Androgynous,Male,20


In [5]:
def vectorization(df, columns):
    """
    Using recursion, iterate through the df until all the categories have been vectorized
    """
    column_name = columns[0]
    
    # Checking if the column name has been removed already
    if column_name not in ['Profiles', 'Style', 'Gender']:
        return df
    
    else:
        # Instantiating the Vectorizer
        vectorizer = CountVectorizer()
        
        # Fitting the vectorizer to the Bios
        x = vectorizer.fit_transform(df[column_name])

        # Creating a new DF that contains the vectorized words
        df_wrds = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())

        # Concating the words DF with the original DF
        new_df = pd.concat([df, df_wrds], axis=1)

        # Dropping the column because it is no longer needed in place of vectorization
        new_df = new_df.drop(column_name, axis=1)
        
        return vectorization(new_df, new_df.columns) 

In [6]:
# Creating the vectorized DF
vect_df = vectorization(df, df.columns)
vect_df

Unnamed: 0,Age,androgynous,boho,casual,classic,dramatic,edgy,editorial,elegant,feminine,...,minimalist,preppy,romantic,sexy,streetwear,vintage,binary,female,male,non
0,18,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,20,0,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,1
2,22,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
3,21,1,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
4,29,1,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73435,23,0,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
73436,20,1,0,0,0,0,0,1,1,0,...,0,1,0,0,0,0,0,1,0,0
73437,23,0,1,1,0,0,1,1,0,0,...,1,0,0,0,0,0,0,0,1,0
73438,20,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [7]:
scaler = MinMaxScaler()

vect_df = pd.DataFrame(scaler.fit_transform(vect_df), index=vect_df.index, columns=vect_df.columns)

In [8]:
vect_df

Unnamed: 0,Age,androgynous,boho,casual,classic,dramatic,edgy,editorial,elegant,feminine,...,minimalist,preppy,romantic,sexy,streetwear,vintage,binary,female,male,non
0,0.000000,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.058824,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.117647,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.088235,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.323529,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73435,0.147059,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
73436,0.058824,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
73437,0.147059,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
73438,0.058824,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## PCA

In [9]:
from sklearn.decomposition import PCA

# Performing dimension reduction
# Instantiating PCA
pca = PCA()

# Fitting and Transforming the DF
df_pca = pca.fit_transform(vect_df)

# Finding the exact number of features to maximize the variance in the data
total_explained_variance = pca.explained_variance_ratio_.cumsum()
n_over_9 = len(total_explained_variance[total_explained_variance>=.99])
n_to_reach_9 = vect_df.shape[1] - n_over_9

print("PCA reduces the # of features from", vect_df.shape[1], 'to', n_to_reach_9)

# Reducing the dataset to the number of features determined before
pca = PCA(n_components=n_to_reach_9)

# Fitting and transforming the dataset to the stated number of features
df_pca = pca.fit_transform(vect_df)

# Seeing the variance ratio that still remains after the dataset has been reduced
pca.explained_variance_ratio_.cumsum()[-1]

PCA reduces the # of features from 41 to 35


0.9742948001271149

### Using Hierarchical Agglomerative Clustering


In [10]:
# Setting the amount of clusters to test out
cluster_cnt = [i for i in range(2, 11, 1)]

# Establishing empty lists to store the scores for the evaluation metrics
ch_scores = []

s_scores = []

db_scores = []

# The DF for evaluation
eval_df = df_pca

# Looping through different iterations for the number of clusters
for i in tqdm(cluster_cnt):
    
    # Clustering with different number of clusters
    clust = AgglomerativeClustering(n_clusters=i, linkage='complete')
    
    clust.fit(eval_df)
    
    cluster_assignments = clust.labels_
    
    # Appending the scores to the empty lists
    ch_scores.append(calinski_harabasz_score(eval_df, cluster_assignments))
    
    s_scores.append(silhouette_score(eval_df, cluster_assignments))
    
    db_scores.append(davies_bouldin_score(eval_df, cluster_assignments))

  0%|          | 0/9 [00:00<?, ?it/s]

In [1]:
def cluster_eval(y, x):
    """
    Prints the scores of a set evaluation metric. Prints out the max and min values of the evaluation scores.
    """
    
    # Creating a DataFrame for returning the max and min scores for each cluster
    df = pd.DataFrame(columns=['Cluster Score'], index=[i for i in range(2, len(y)+2)])
    df['Cluster Score'] = y
    
    print('Max Value:\nCluster #', df[df['Cluster Score']==df['Cluster Score'].max()])
    print('\nMin Value:\nCluster #', df[df['Cluster Score']==df['Cluster Score'].min()])
    
    # Plotting out the scores based on cluster count
    plt.figure(figsize=(16,6))
    plt.style.use('bmh')
    plt.plot(x,y)
    plt.xlabel('# of Clusters')
    plt.ylabel('Score')
    plt.show()

Cluster Evaluation

In [2]:
print("The Calinski-Harabasz Score (find max score):")
cluster_eval(ch_scores, cluster_cnt)

print("\nThe Silhouette Coefficient Score (find max score):")
cluster_eval(s_scores, cluster_cnt)

print("\nThe Davies-Bouldin Score (find minimum score):")
cluster_eval(db_scores, cluster_cnt)

The Calinski-Harabasz Score (find max score):


NameError: ignored

### Running HAC using optimized stuff


In [None]:
# Instantiating HAC based on the optimum number of clusters found
hac = AgglomerativeClustering(n_clusters="insert number here", linkage='complete')

# Fitting
hac.fit(df_pca)

# Getting cluster assignments
cluster_assignments = hac.labels_

# Assigning the clusters to each profile
df['Cluster #'] = cluster_assignments

vect_df['Cluster #'] = cluster_assignments

In [None]:
with open("refined_cluster.pkl",'wb') as fp:
    pickle.dump(df, fp)
    
with open("vectorized_refined.pkl", 'wb') as fp:
    pickle.dump(vect_df, fp)

Matching new profile to previous ones

In [None]:
# Importing 3 models
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Perform training, splitting and testing
#  Assigning the split variables
X = vect_df.drop(["Cluster #"], 1)
y = vect_df['Cluster #']

# Train, test, split
X_train, X_test, y_train, y_test = train_test_split(X, y)

### Finding the Best Model
- Dummy (Baseline Model)
- KNN
- SVM
- NaiveBayes
- Logistic Regression
- Adaboost

In [None]:
# Dummy
dummy = DummyClassifier(strategy='stratified')

# KNN
knn = KNeighborsClassifier()

# SVM
svm = SVC(gamma='scale')

# NaiveBayes
nb = ComplementNB()

# Logistic Regression
lr = LogisticRegression()

# Adaboost
adab = AdaBoostClassifier()

# List of models
models = [dummy, knn, svm, nb, lr, adab]

# List of model names
names = ['Dummy', 'KNN', 'SVM', 'NaiveBayes', 'Logistic Regression', 'Adaboost']

# Zipping the lists
classifiers = dict(zip(names, models))

In [None]:
# Visualization of the different cluster counts
vect_df['Cluster #'].value_counts().plot('pie', title='Count of Class Distribution');

Since we are dealing with an imbalanced dataset _(because each cluster is not guaranteed to have the same amount of profiles)_, we will resort to using the __Macro Avg__ and __F1 Score__ for evaluating the performances of each model.

In [None]:
# Dictionary containing the model names and their scores
models_f1 = {}

# Looping through each model's predictions and getting their classification reports
for name, model in tqdm(classifiers.items()):
    # Fitting the model
    model.fit(X_train, y_train)
    
    print('\n'+ name + ' (Macro Avg - F1 Score):')
    
    # Classification Report
    report = classification_report(y_test, model.predict(X_test), output_dict=True)
    f1 = report['macro avg']['f1-score']
    
    # Assigning to the Dictionary
    models_f1[name] = f1
    
    print(f1)

Finding model with the best performance

In [None]:
print(max(models_f1, key=models_f1.get), 'Score:', max(models_f1.values()))

In [None]:
# Fitting the model
# nb.fit(X, y)
# svm.fit(X,y) etc

Saving the model

In [None]:
from joblib import dump

dump(nb, "refined_model.joblib")