In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [2]:
# Location of the labelled dataset, relative to this notebook.
DATA_PATH = '../0_data/data_model.xlsx'

# Read the spreadsheet and preview the first rows.
df = pd.read_excel(DATA_PATH)
df.head()

Unnamed: 0,Long,Face,Skin_pct,Auto_caption,id,Short,link,Objectify (yes or no),caption,Has_women,Key_path,No_humans
0,2015-11-18_18-26-59_UTC,1,0.042849,a picture of a woman with long black hair sits...,207,-PHbiexlr_,https://www.instagram.com/p/-PHbiexlr_/?igshid...,1,Stunning Mariana Rodriguez in #amencouture. Se...,1,../1_download_data/data/raw/-PHbiexlr_\2015-11...,0
1,2015-06-01_07-45-08_UTC,1,0.491443,a picture of a woman with black hair and a bla...,214,3YO1o0Rlkt,https://www.instagram.com/p/3YO1o0Rlkt/?igshid...,1,@emastokholma in Amen embroidered #dress from ...,1,../1_download_data/data/raw/3YO1o0Rlkt\2015-06...,0
2,2015-07-06_10-38-30_UTC,1,0.133582,a picture of a woman with black hair and a bla...,213,4yqgDyxloK,https://www.instagram.com/p/4yqgDyxloK/?igshid...,1,Stunning DJ @emastokholma wears all over paill...,1,../1_download_data/data/raw/4yqgDyxloK\2015-07...,0
3,2020-04-10_08-57-20_UTC,0,0.258619,a picture of a woman wearing a black jacket an...,209,B-y2Fo8qcN_,https://www.instagram.com/p/B-y2Fo8qcN_/?igshi...,1,Crystal Pumps ✨ #AmenStyle #AmenPreSpring20 #A...,1,../1_download_data/data/raw/B-y2Fo8qcN_\2020-0...,0
4,2019-10-24_11-49-40_UTC,1,0.888304,a picture of a woman in a red dress sits on a ...,200,B3__d73i46x,https://www.instagram.com/p/B3__d73i46x/?igshi...,1,Discover our FW19 Collection on amenstyle.com ...,1,../1_download_data/data/raw/B3__d73i46x\2019-1...,0


In [3]:
# Give the label column a short name that is easier to reference below.
df = df.rename(columns={'Objectify (yes or no)': 'Objectify'})

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,Face,Skin_pct,id,Objectify,Has_women,No_humans
count,661.0,661.0,661.0,661.0,661.0,661.0
mean,0.881997,0.254157,149.216339,0.695915,1.0,0.0
std,0.322856,0.213565,82.665152,0.460367,0.0,0.0
min,0.0,0.0,1.0,0.0,1.0,0.0
25%,1.0,0.086044,78.0,0.0,1.0,0.0
50%,1.0,0.197287,154.0,1.0,1.0,0.0
75%,1.0,0.371213,222.0,1.0,1.0,0.0
max,1.0,0.973183,283.0,1.0,1.0,0.0


In [5]:
# Baseline feature set: skin percentage and the face flag.
baseline_feature_columns = ['Skin_pct', 'Face']
X = df[baseline_feature_columns]
# Target: the renamed 'Objectify' label column.
y = df['Objectify']

# Hold out 30% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [6]:
# Fit six off-the-shelf classifiers on the baseline features (Skin_pct, Face).
# The tree-based models draw random numbers while training, so they are seeded
# to make the reported accuracies repeatable on a fresh kernel run.
logreg = LogisticRegression().fit(X_train, y_train)
knn = KNeighborsClassifier().fit(X_train, y_train)
svm = SVC().fit(X_train, y_train)
dtree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
rforest = RandomForestClassifier(random_state=42).fit(X_train, y_train)
gboost = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Predictions on the held-out rows, one array per model.
logreg_pred = logreg.predict(X_test)
knn_pred = knn.predict(X_test)
svm_pred = svm.predict(X_test)
dtree_pred = dtree.predict(X_test)
rforest_pred = rforest.predict(X_test)
gboost_pred = gboost.predict(X_test)

# Report test-set accuracy for every model.
for model_label, model_pred in (
    ("Logistic Regression", logreg_pred),
    ("KNN", knn_pred),
    ("SVM", svm_pred),
    ("Decision Tree", dtree_pred),
    ("Random Forest", rforest_pred),
    ("Gradient Boosting", gboost_pred),
):
    print(f"{model_label} Accuracy:", accuracy_score(y_test, model_pred))


Logistic Regression Accuracy: 0.6633165829145728
KNN Accuracy: 0.5979899497487438
SVM Accuracy: 0.6733668341708543
Decision Tree Accuracy: 0.5829145728643216
Random Forest Accuracy: 0.5778894472361809
Gradient Boosting Accuracy: 0.6080402010050251


## Use Word Embeddings

In [7]:
# Dependencies for the word-embedding section (numpy and pandas are re-imported here).
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
import nltk
# Fetch the NLTK 'punkt' data package; presumably required by word_tokenize — TODO confirm.
nltk.download('punkt')

  "class": algorithms.Blowfish,
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Download the pre-trained Word2Vec embeddings and save them locally. They are not stored in this folder because GitHub's free tier does not support files this large.
File name: GoogleNews-vectors-negative300.bin
Download link: https://www.kaggle.com/datasets/leadbest/googlenewsvectorsnegative300

In [8]:
# The embeddings file is kept on the local machine, outside the repository.
EMBEDDINGS_PATH = '../../../Embeddings/GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin'

# Load the vectors from the binary word2vec file.
model = KeyedVectors.load_word2vec_format(EMBEDDINGS_PATH, binary=True)

In [9]:
def sentence_to_avg_vector(sentence, model):
    """Return the mean word vector of ``sentence``.

    The sentence is lower-cased and tokenized with ``word_tokenize``; tokens
    that are not present in ``model`` are ignored.

    Parameters
    ----------
    sentence : str
        Text to embed. Missing values (``None``, ``NaN``), other non-string
        values, and empty or whitespace-only strings yield the zero vector
        instead of raising.
    model : object
        Embedding lookup that supports ``token in model``, ``model[token]``
        and exposes ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Vector of length ``model.vector_size``: the element-wise mean of the
        known tokens' vectors, or all zeros when there are none.
    """
    # Guard first: a NaN cell read from a spreadsheet is a float and has no .lower().
    if not isinstance(sentence, str) or not sentence.strip():
        return np.zeros(model.vector_size)

    tokens = word_tokenize(sentence.lower())
    known_vectors = [model[token] for token in tokens if token in model]
    if not known_vectors:
        # No token is in the vocabulary, so there is nothing to average.
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

In [10]:
# Embed every auto-generated caption as a single averaged word vector.
caption_vectors = df['Auto_caption'].apply(sentence_to_avg_vector, args=(model,))
df['sentence_vector'] = caption_vectors

# Stack the per-row vectors into one 2-D feature matrix for the classifiers.
X_embeddings = np.array(caption_vectors.tolist())

In [11]:
# Append the baseline columns (Skin_pct, Face) to the right of the embedding columns.
baseline_values = X.to_numpy()
X_combined = np.concatenate([X_embeddings, baseline_values], axis=1)
X_combined

array([[ 4.27223481e-02,  3.72857004e-02, -2.62578321e-03, ...,
        -4.93028434e-03,  4.28485870e-02,  1.00000000e+00],
       [ 3.92417908e-02,  6.11674003e-02, -1.57318115e-02, ...,
        -5.69152832e-03,  4.91442871e-01,  1.00000000e+00],
       [ 5.21591194e-02,  4.77233902e-02,  9.75036621e-03, ...,
         1.49719240e-02,  1.33582176e-01,  1.00000000e+00],
       ...,
       [ 5.52016683e-02,  3.56270932e-02,  2.24958151e-03, ...,
         4.49044368e-04,  2.44262356e-01,  1.00000000e+00],
       [ 1.91882905e-02,  5.26588075e-02, -1.14281066e-02, ...,
         5.10428287e-02,  2.20332031e-01,  1.00000000e+00],
       [ 4.89985161e-02,  5.01327515e-02, -2.30077114e-02, ...,
         4.17919159e-02,  3.72958984e-01,  1.00000000e+00]])

# Classifier with embeddings

In [12]:
# Features: caption embeddings plus the baseline columns; the target is unchanged.
X0, Y = X_combined, df['Objectify']

# Same test_size and random_state as the baseline split.
# Note: X_train / X_test are overwritten here with the combined features.
X_train, X_test, Y_train, Y_test = train_test_split(
    X0,
    Y,
    test_size=0.3,
    random_state=42,
)

In [13]:
# Fit the same six classifiers on the combined (embeddings + baseline) features.
# Labels must come from the split that produced these X_train / X_test arrays,
# i.e. Y_train / Y_test; the lower-case y_train / y_test belong to the earlier
# baseline split and only line up with these rows by coincidence of the seed.
# The tree-based models are seeded so the scores are repeatable.
logreg = LogisticRegression().fit(X_train, Y_train)
knn = KNeighborsClassifier().fit(X_train, Y_train)
svm = SVC().fit(X_train, Y_train)
dtree = DecisionTreeClassifier(random_state=42).fit(X_train, Y_train)
rforest = RandomForestClassifier(random_state=42).fit(X_train, Y_train)
gboost = GradientBoostingClassifier(random_state=42).fit(X_train, Y_train)

# Predictions on the held-out rows, one array per model.
logreg_pred = logreg.predict(X_test)
knn_pred = knn.predict(X_test)
svm_pred = svm.predict(X_test)
dtree_pred = dtree.predict(X_test)
rforest_pred = rforest.predict(X_test)
gboost_pred = gboost.predict(X_test)

# Report test-set accuracy for every model.
for model_label, model_pred in (
    ("Logistic Regression", logreg_pred),
    ("KNN", knn_pred),
    ("SVM", svm_pred),
    ("Decision Tree", dtree_pred),
    ("Random Forest", rforest_pred),
    ("Gradient Boosting", gboost_pred),
):
    print(f"{model_label} Accuracy:", accuracy_score(Y_test, model_pred))

Logistic Regression Accuracy: 0.6884422110552764
KNN Accuracy: 0.7185929648241206
SVM Accuracy: 0.6633165829145728
Decision Tree Accuracy: 0.5577889447236181
Random Forest Accuracy: 0.7185929648241206
Gradient Boosting Accuracy: 0.7035175879396985


In [17]:
# Mean of the label column; assuming Objectify is coded 0/1, this is the share of positive labels.
y.mean()

0.6959152798789713

Also compute the F1 score to choose the best model, because the dataset is imbalanced (about 70% of the labels are positive).

In [14]:
from sklearn.metrics import f1_score

# Score the predictions against Y_test, the labels from the split that
# produced the embedding-based X_test the models predicted on.
# NOTE(review): the positive class is the majority (~70% of labels), so the
# default positive-class F1 rewards predicting 1 for everything; consider
# also reporting f1_score(..., average='macro') before picking a model.
f1_logreg = f1_score(Y_test, logreg_pred)
f1_knn = f1_score(Y_test, knn_pred)
f1_svm = f1_score(Y_test, svm_pred)
f1_dtree = f1_score(Y_test, dtree_pred)
f1_rforest = f1_score(Y_test, rforest_pred)
f1_gboost = f1_score(Y_test, gboost_pred)

# Print F1 scores
for model_label, model_f1 in (
    ("Logistic Regression", f1_logreg),
    ("KNN", f1_knn),
    ("SVM", f1_svm),
    ("Decision Tree", f1_dtree),
    ("Random Forest", f1_rforest),
    ("Gradient Boosting", f1_gboost),
):
    print(f"{model_label} F1 Score:", model_f1)

Logistic Regression F1 Score: 0.8074534161490684
KNN F1 Score: 0.8082191780821918
SVM F1 Score: 0.797583081570997
Decision Tree F1 Score: 0.653543307086614
Random Forest F1 Score: 0.8181818181818182
Gradient Boosting F1 Score: 0.802675585284281


With the best model selected, we can automatically label all the remaining data.

In [None]:
# import other data