In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [2]:
# Load the labelled dataset from the gold data folder; the preview shows the image path,
# the image features (face, skin), the cleaned caption text and the `objectifies` label.
df = pd.read_excel('../0_data/gold/8_data_model.xlsx')
df.head()

Unnamed: 0,img_path,face,skin,text_deep_clean,objectifies
0,../1_download_data/jacquemus\2022-05-30_15-43-...,0,0.332761,jacquemus sydney hawai i night tom woman long ...,1
1,../1_download_data/jacquemus\2020-11-12_17-00-...,1,0.138254,jacquemus l annee fw shoot rosa picture woman ...,0
2,../1_download_data/hm\2023-09-24_18-34-48_UTC\...,0,0.002294,handbag hold close woman wear black dress blac...,1
3,../1_download_data/jacquemus\2022-03-23_16-47-...,0,0.125818,jacquemus le sac rond so happy have work iconi...,1
4,../1_download_data/chanelofficial\2023-12-08_1...,1,0.145028,chanelofficial opening chanel metier art show ...,0


In [3]:
# Summary statistics of the numeric columns (face, skin, objectifies)
df.describe()

Unnamed: 0,face,skin,objectifies
count,1587.0,1587.0,1587.0
mean,0.908003,0.228274,0.412098
std,0.289113,0.213313,0.492368
min,0.0,0.0,0.0
25%,1.0,0.065017,0.0
50%,1.0,0.159732,0.0
75%,1.0,0.336554,1.0
max,1.0,0.9996,1.0


In [4]:
# Image-derived predictors and the target label
X = df.loc[:, ['skin', 'face']]
y = df.loc[:, 'objectifies']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [5]:
# Baseline: predict `objectifies` from the two image features only ('skin', 'face').
# The tree-based / ensemble estimators draw random numbers while fitting, so they are
# seeded (same seed as the train/test split) to make the reported scores reproducible.

# Logistic Regression
logreg = LogisticRegression()
logreg_pred = logreg.fit(X_train, y_train).predict(X_test)

# K-Nearest Neighbors
knn = KNeighborsClassifier()
knn_pred = knn.fit(X_train, y_train).predict(X_test)

# Support Vector Machine
svm = SVC()
svm_pred = svm.fit(X_train, y_train).predict(X_test)

# Decision Tree
dtree = DecisionTreeClassifier(random_state=42)
dtree_pred = dtree.fit(X_train, y_train).predict(X_test)

# Random Forest
rforest = RandomForestClassifier(random_state=42)
rforest_pred = rforest.fit(X_train, y_train).predict(X_test)

# Gradient Boosting
gboost = GradientBoostingClassifier(random_state=42)
gboost_pred = gboost.fit(X_train, y_train).predict(X_test)

# Accuracy of every model on the held-out split
print("Logistic Regression Accuracy:", accuracy_score(y_test, logreg_pred))
print("KNN Accuracy:", accuracy_score(y_test, knn_pred))
print("SVM Accuracy:", accuracy_score(y_test, svm_pred))
print("Decision Tree Accuracy:", accuracy_score(y_test, dtree_pred))
print("Random Forest Accuracy:", accuracy_score(y_test, rforest_pred))
print("Gradient Boosting Accuracy:", accuracy_score(y_test, gboost_pred))


Logistic Regression Accuracy: 0.6331236897274634
KNN Accuracy: 0.5723270440251572
SVM Accuracy: 0.6226415094339622
Decision Tree Accuracy: 0.5241090146750524
Random Forest Accuracy: 0.5241090146750524
Gradient Boosting Accuracy: 0.59958071278826


## Use Word Embeddings

In [6]:
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
import nltk
# Fetch the NLTK 'punkt' package (presumably the tokenizer data word_tokenize relies on);
# the download is skipped when the package is already up to date.
nltk.download('punkt')

  "class": algorithms.Blowfish,
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Download the Word2Vec embeddings and save them locally. They are not stored in this folder because GitHub does not support large files on the free tier.
File name: GoogleNews-vectors-negative300.bin
Download link: https://www.kaggle.com/datasets/leadbest/googlenewsvectorsnegative300

In [7]:
# Load the GoogleNews word2vec vectors (binary word2vec format) from a local copy kept outside the repository.
model = KeyedVectors.load_word2vec_format('../../../Embeddings/GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin', binary=True) # embeddings are stored locally on this machine

In [8]:
def sentence_to_avg_vector(sentence, model):
    """Return the mean word vector of ``sentence``.

    The sentence is lower-cased and tokenized with ``word_tokenize``; tokens
    that are not present in ``model`` are skipped.

    Parameters
    ----------
    sentence : str
        Text to embed. Non-string values (e.g. ``None`` or the ``NaN`` pandas
        produces for an empty cell) are treated as empty text instead of
        raising on ``.lower()``.
    model : mapping-like
        Must support ``word in model`` and ``model[word]`` and expose
        ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Array of length ``model.vector_size``: the element-wise mean of the
        vectors of all known tokens, or all zeros when there are none.
    """
    # Guard first: a missing caption has nothing to tokenize.
    if not isinstance(sentence, str):
        return np.zeros(model.vector_size)

    words = word_tokenize(sentence.lower())
    word_vectors = [model[word] for word in words if word in model]
    if not word_vectors:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

In [9]:
# Embed every cleaned caption as the average of its word vectors
df['sentence_vector'] = df['text_deep_clean'].apply(sentence_to_avg_vector, args=(model,))

# Collect the per-row vectors into a single feature matrix (one row per sample)
X_embeddings = np.array(list(df['sentence_vector']))

In [10]:
# Append the image features ('skin', 'face') as the last two columns of the embedding matrix
X_combined = np.hstack((X_embeddings, np.asarray(X)))
X_combined

array([[-1.28784180e-02,  4.12375703e-02, -8.36736523e-03, ...,
         3.70955020e-02,  3.32760893e-01,  0.00000000e+00],
       [ 1.17519209e-02, -1.64370332e-02,  9.43523925e-03, ...,
         4.52920683e-02,  1.38254458e-01,  1.00000000e+00],
       [ 3.10849268e-02,  3.15718213e-03, -4.59345020e-02, ...,
        -8.76686769e-04,  2.29355281e-03,  0.00000000e+00],
       ...,
       [-1.88683402e-02, -4.65901708e-03, -2.67808698e-02, ...,
        -2.97105573e-02,  9.99600309e-01,  0.00000000e+00],
       [-2.45157885e-03,  7.34268203e-02, -1.33702597e-02, ...,
        -1.53767904e-02,  2.97908093e-02,  1.00000000e+00],
       [ 3.72042656e-02, -1.88064575e-03, -3.41339111e-02, ...,
         6.53152466e-02,  1.18575488e-01,  1.00000000e+00]])

# Classifier with embeddings

In [11]:
# Train/test split for the embedding-based classifiers.
#
# `model`, `sentence_to_avg_vector`, `X_embeddings` and `X_combined` are all produced by
# the cells above from the same `df`, so they are reused here rather than reloading the
# large embedding file and recomputing every sentence vector a second time.
X = df[['skin', 'face']]
y = df['objectifies']

# Combined matrix: embedding dimensions followed by 'skin' and 'face'
X0 = X_combined
Y = df['objectifies']

# Same test share and seed as the feature-only split
X_train, X_test, Y_train, Y_test = train_test_split(X0, Y, test_size=0.3, random_state=42)

In [12]:
# Same model line-up as the baseline, now trained on the combined embedding + image features.
#
# X_train / X_test come from the embedding split, so the matching labels are Y_train / Y_test
# returned by that same train_test_split call. Using y_train / y_test from the earlier
# feature-only split only lines up by coincidence (identical seed and row count).
# The tree-based / ensemble estimators are seeded so the reported scores are reproducible.

# Logistic Regression
logreg = LogisticRegression()
logreg_pred = logreg.fit(X_train, Y_train).predict(X_test)

# K-Nearest Neighbors
knn = KNeighborsClassifier()
knn_pred = knn.fit(X_train, Y_train).predict(X_test)

# Support Vector Machine
svm = SVC()
svm_pred = svm.fit(X_train, Y_train).predict(X_test)

# Decision Tree
dtree = DecisionTreeClassifier(random_state=42)
dtree_pred = dtree.fit(X_train, Y_train).predict(X_test)

# Random Forest
rforest = RandomForestClassifier(random_state=42)
rforest_pred = rforest.fit(X_train, Y_train).predict(X_test)

# Gradient Boosting
gboost = GradientBoostingClassifier(random_state=42)
gboost_pred = gboost.fit(X_train, Y_train).predict(X_test)

# Accuracy of every model on the held-out split
print("Logistic Regression Accuracy:", accuracy_score(Y_test, logreg_pred))
print("KNN Accuracy:", accuracy_score(Y_test, knn_pred))
print("SVM Accuracy:", accuracy_score(Y_test, svm_pred))
print("Decision Tree Accuracy:", accuracy_score(Y_test, dtree_pred))
print("Random Forest Accuracy:", accuracy_score(Y_test, rforest_pred))
print("Gradient Boosting Accuracy:", accuracy_score(Y_test, gboost_pred))

Logistic Regression Accuracy: 0.6771488469601677
KNN Accuracy: 0.6813417190775681
SVM Accuracy: 0.6813417190775681
Decision Tree Accuracy: 0.5953878406708596
Random Forest Accuracy: 0.6834381551362684
Gradient Boosting Accuracy: 0.6960167714884696


In [13]:
# Mean of the label column; if `objectifies` is a 0/1 label (describe() shows min 0, max 1)
# this is the share of rows labelled 1, i.e. the class balance.
y.mean()

0.4120982986767486

Also look at the F1-score to choose the best model, because the dataset is imbalanced.

In [14]:
from sklearn.metrics import f1_score

# F1 scores on the held-out split.
# The predictions were made from the embedding-based X_test, so they are scored against
# Y_test, the labels returned by that same train_test_split call, rather than y_test
# left over from the earlier feature-only split.
f1_logreg = f1_score(Y_test, logreg_pred)
f1_knn = f1_score(Y_test, knn_pred)
f1_svm = f1_score(Y_test, svm_pred)
f1_dtree = f1_score(Y_test, dtree_pred)
f1_rforest = f1_score(Y_test, rforest_pred)
f1_gboost = f1_score(Y_test, gboost_pred)

# Print F1 scores
print("Logistic Regression F1 Score:", f1_logreg)
print("KNN F1 Score:", f1_knn)
print("SVM F1 Score:", f1_svm)
print("Decision Tree F1 Score:", f1_dtree)
print("Random Forest F1 Score:", f1_rforest)
print("Gradient Boosting F1 Score:", f1_gboost)

Logistic Regression F1 Score: 0.5304878048780488
KNN F1 Score: 0.6041666666666666
SVM F1 Score: 0.5096774193548387
Decision Tree F1 Score: 0.5089058524173027
Random Forest F1 Score: 0.5465465465465466
Gradient Boosting F1 Score: 0.5915492957746478


Once we have the best model, we can use it to automatically label all the remaining data.

# See feature importance on objectification

In [15]:
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize

# Fit a default XGBoost classifier on the combined embedding + image-feature split
xgboost_model = XGBClassifier()
xgboost_model.fit(X_train, Y_train)
xgboost_pred = xgboost_model.predict(X_test)

# Accuracy on the held-out split
print("XGBoost Accuracy:", accuracy_score(Y_test, xgboost_pred))

# One importance value per input column
feature_importances = xgboost_model.feature_importances_

# Column names in the order the matrix was assembled: embedding dimensions, then 'skin' and 'face'
word_embedding_feature_names = [f'word_embedding_{idx}' for idx in range(X_embeddings.shape[1])]
other_feature_names = ['skin', 'face']
feature_names = [*word_embedding_feature_names, *other_feature_names]

# Map each name to its importance and rank from most to least important
features = {name: score for name, score in zip(feature_names, feature_importances)}
sorted_features = sorted(features.items(), key=lambda pair: pair[1], reverse=True)

# Report the ranking
for feature, importance in sorted_features:
    print(f"Feature: {feature}, Importance: {importance}")

XGBoost Accuracy: 0.7064989517819706
Feature: word_embedding_191, Importance: 0.018182197585701942
Feature: word_embedding_82, Importance: 0.018125800415873528
Feature: word_embedding_60, Importance: 0.014764705672860146
Feature: word_embedding_66, Importance: 0.013874349184334278
Feature: word_embedding_287, Importance: 0.01170937530696392
Feature: word_embedding_99, Importance: 0.011149009689688683
Feature: word_embedding_226, Importance: 0.011026379652321339
Feature: word_embedding_176, Importance: 0.010935235768556595
Feature: face, Importance: 0.010844764299690723
Feature: word_embedding_291, Importance: 0.010560356080532074
Feature: word_embedding_198, Importance: 0.01051478274166584
Feature: word_embedding_52, Importance: 0.010360250249505043
Feature: word_embedding_216, Importance: 0.01027311198413372
Feature: word_embedding_124, Importance: 0.010083691217005253
Feature: word_embedding_196, Importance: 0.009522038511931896
Feature: word_embedding_48, Importance: 0.0094866883009

In [None]:
# import other data