### Data preparation

In [None]:
import os
import re
import pandas as pd

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def preprocess(text):
  text = text.lower()
  text = re.sub(r'\d+', '', text)
  text = re.sub(r'[^\w\s]', '', text)
  text = re.sub(r'\s+', ' ', text)
  return text.strip()

#### Books

In [None]:
lit_corpus_male = []
lit_corpus_female = []
for root, dirs, files in os.walk('/content/drive/MyDrive/Master_s/HLT/data/books'):
  for file in files:
    if file.startswith('m_'):
      with open(os.path.join(root, file), 'r') as f:
        lit_corpus_male.append(f.read())
    else:
      with open(os.path.join(root, file), 'r') as f:
        lit_corpus_female.append(f.read())

In [None]:
paragraphs_male = []
paragraphs_female = []
for i in range(len(lit_corpus_male)):
  paragraphs_male.extend([preprocess(paragraph) for paragraph in lit_corpus_male[i].split('\n\n')])
for i in range(len(lit_corpus_female)):
  paragraphs_female.extend([preprocess(paragraph) for paragraph in lit_corpus_female[i].split('\n\n')])

In [None]:
df_books = pd.concat([pd.DataFrame({'paragraph': paragraphs_male, 'gender': 0}),
                      pd.DataFrame({'paragraph': paragraphs_female, 'gender': 1})],
                     ignore_index=True)

In [None]:
df_books['paragraph'] = df_books['paragraph'].apply(lambda x: x.strip())

In [None]:
df_books = df_books[df_books['paragraph'].apply(len) > 5].reset_index(drop=True)

#### Songs

In [None]:
import zipfile

In [None]:
with zipfile.ZipFile('/content/drive/MyDrive/Master_s/HLT/data/songs/spotify_millsongdata.csv.zip', 'r') as z:
    with z.open('spotify_millsongdata.csv') as f:
        df_songs_spotify = pd.read_csv(f)

In [None]:
with open ('/content/drive/MyDrive/Master_s/HLT/data/songs/male_songwriters', 'r') as f:
  male_songwriters = f.readlines()
with open ('/content/drive/MyDrive/Master_s/HLT/data/songs/female_songwriters', 'r') as f:
  female_songwriters = f.readlines()

In [None]:
df_songs_spotify['gender'] = -1

In [None]:
df_songs_spotify.loc[df_songs_spotify['artist'].isin([artist.strip() for artist in male_songwriters]), 'gender'] = 0
df_songs_spotify.loc[df_songs_spotify['artist'].isin([artist.strip() for artist in female_songwriters]), 'gender'] = 1

In [None]:
df_songs_spotify = df_songs_spotify[df_songs_spotify['gender'].isin([0, 1])][['text', 'gender']].reset_index(drop=True)

In [None]:
new_rows = []
for index, row in df_songs_spotify.iterrows():
    paragraphs = row['text'].split('\r\n  \r\n')
    for paragraph in paragraphs:
        if paragraph.strip():
            new_row = row.copy()
            new_row['paragraph'] = preprocess(re.sub(r'\[.*]', '', paragraph))
            new_rows.append(new_row)

In [None]:
df_songs = pd.DataFrame(new_rows)

In [None]:
df_songs = df_songs[df_songs['paragraph'].apply(lambda x: 'lyrics' not in x)].reset_index(drop=True).drop(columns=['text'])

In [None]:
df_songs = df_songs[df_songs['paragraph'].apply(len) > 5].reset_index(drop=True)

#### datasets statistics

In [None]:
import nltk
from nltk.tokenize import word_tokenize

In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
df_books['paragraph'].apply(lambda text: len(word_tokenize(text))).describe()

count    23240.000000
mean        42.845740
std         62.079005
min          1.000000
25%          9.000000
50%         20.000000
75%         52.000000
max       1489.000000
Name: paragraph, dtype: float64

In [None]:
print("Tokens in books data: ", df_books['paragraph'].apply(lambda text: len(word_tokenize(text))).sum())

Tokens in books data:  995735


In [None]:
df_songs['paragraph'].apply(lambda text: len(word_tokenize(text))).describe()

count    18272.000000
mean        36.611044
std         32.870378
min          1.000000
25%         21.000000
50%         30.000000
75%         43.000000
max        663.000000
Name: paragraph, dtype: float64

In [None]:
print("Tokens in songs data: ", df_songs['paragraph'].apply(lambda text: len(word_tokenize(text))).sum())

Tokens in songs data:  668957


In [None]:
df_books['gender'].value_counts()

gender
1    11702
0    11538
Name: count, dtype: int64

In [None]:
df_songs['gender'].value_counts()

gender
0    9180
1    9092
Name: count, dtype: int64

### ML Models

#### features extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorizer = TfidfVectorizer(max_features=5000)
X_books = vectorizer.fit_transform(df_books['paragraph'])
y_books = df_books['gender']

In [None]:
X_songs = vectorizer.fit_transform(df_songs['paragraph'])
y_songs = df_songs['gender']

#### SVM

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV

In [None]:
svm_model_on_books = SVC(kernel='linear', random_state=5)
svm_model_on_books.fit(X_books, y_books)
y_pred = svm_model_on_books.predict(X_songs)

In [None]:
accuracy_score(y_songs, y_pred)

0.49934325744308233

In [None]:
print(classification_report(y_songs, y_pred))

              precision    recall  f1-score   support

           0       0.50      0.52      0.51      9180
           1       0.50      0.48      0.49      9092

    accuracy                           0.50     18272
   macro avg       0.50      0.50      0.50     18272
weighted avg       0.50      0.50      0.50     18272



In [None]:
svm_model_on_songs = SVC(kernel='linear', random_state=5)
svm_model_on_songs.fit(X_songs, y_songs)
y_pred = svm_model_on_songs.predict(X_books)

In [None]:
accuracy_score(y_books, y_pred)

0.4855421686746988

In [None]:
print(classification_report(y_books, y_pred))

              precision    recall  f1-score   support

           0       0.48      0.53      0.51     11538
           1       0.49      0.44      0.46     11702

    accuracy                           0.49     23240
   macro avg       0.49      0.49      0.48     23240
weighted avg       0.49      0.49      0.48     23240



CV

In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=0)

cross-validation on books data:

In [None]:
svm_model_cv_books = SVC(kernel='linear', random_state=5)

In [None]:
cv_scores = cross_val_score(svm_model_cv_books, X_books, y_books, cv=kf, scoring='accuracy')

In [None]:
print(f'Cross-validation scores: {cv_scores}')
print(f'Mean cross-validation score: {cv_scores.mean()}')

Cross-validation scores: [0.86854561 0.86402754 0.86015491 0.85434596 0.86639415]
Mean cross-validation score: 0.8626936316695353


cross-validation on songs data:

In [None]:
svm_model_cv_songs = SVC(kernel='linear', random_state=5)

In [None]:
cv_scores = cross_val_score(svm_model_cv_songs, X_songs, y_songs, cv=kf, scoring='accuracy')

In [None]:
print(f'Cross-validation scores: {cv_scores}')
print(f'Mean cross-validation score: {cv_scores.mean()}')

Cross-validation scores: [0.71737346 0.7253078  0.7194855  0.72632731 0.71975917]
Mean cross-validation score: 0.721650646893347


#### Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
nb_model_on_books = MultinomialNB()
nb_model_on_books.fit(X_books, y_books)
y_pred = nb_model_on_books.predict(X_songs)

In [None]:
print(classification_report(y_songs, y_pred))

              precision    recall  f1-score   support

           0       0.50      0.26      0.34      9180
           1       0.50      0.74      0.60      9092

    accuracy                           0.50     18272
   macro avg       0.50      0.50      0.47     18272
weighted avg       0.50      0.50      0.47     18272



In [None]:
nb_model_on_songs = MultinomialNB()
nb_model_on_songs.fit(X_songs, y_songs)
y_pred = nb_model_on_songs.predict(X_books)

In [None]:
print(classification_report(y_books, y_pred))

              precision    recall  f1-score   support

           0       0.46      0.61      0.53     11538
           1       0.44      0.30      0.36     11702

    accuracy                           0.46     23240
   macro avg       0.45      0.46      0.44     23240
weighted avg       0.45      0.46      0.44     23240



cv

In [None]:
nb_model_on_books_cv = MultinomialNB()
cv_scores = cross_val_score(nb_model_on_books_cv, X_books, y_books, cv=kf, scoring='accuracy')
print(f'Cross-validation scores: {cv_scores}')
print(f'Mean cross-validation score: {cv_scores.mean()}')

Cross-validation scores: [0.86596386 0.8605852  0.85262478 0.85047332 0.85907917]
Mean cross-validation score: 0.8577452667814114


In [None]:
nb_model_on_songs_cv = MultinomialNB()
cv_scores = cross_val_score(nb_model_on_songs_cv, X_songs, y_songs, cv=kf, scoring='accuracy')
print(f'Cross-validation scores: {cv_scores}')
print(f'Mean cross-validation score: {cv_scores.mean()}')

Cross-validation scores: [0.7119015  0.71928865 0.70908593 0.72194855 0.71182266]
Mean cross-validation score: 0.7148094586671878


#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
param_grid_lr = {
    'C': [0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs'],
    'max_iter': [100, 200, 500, 1000]
}
# kf = KFold(n_splits=5, shuffle=True, random_state=5)
grid_search_lr = GridSearchCV(LogisticRegression(random_state=5), param_grid_lr, cv=kf, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search_lr.fit(X_books, y_books)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
print("Best parameters found: ", grid_search_lr.best_params_)
print("Best cross-validation score: ", grid_search_lr.best_score_)

Best parameters found:  {'C': 10, 'max_iter': 200, 'solver': 'lbfgs'}
Best cross-validation score:  0.8658347676419966


In [None]:
lr_model_on_books = LogisticRegression(max_iter=200, C=10, solver='liblinear', random_state=5)
lr_model_on_books.fit(X_books, y_books)
y_pred = lr_model_on_books.predict(X_songs)

In [None]:
print(classification_report(y_songs, y_pred))

              precision    recall  f1-score   support

           0       0.50      0.46      0.48      9180
           1       0.49      0.53      0.51      9092

    accuracy                           0.49     18272
   macro avg       0.49      0.49      0.49     18272
weighted avg       0.49      0.49      0.49     18272



In [None]:
lr_model_on_songs = LogisticRegression(max_iter=200, C=10, solver='liblinear', random_state=5)
lr_model_on_songs.fit(X_songs, y_songs)
y_pred = lr_model_on_songs.predict(X_books)

In [None]:
print(classification_report(y_books, y_pred))

              precision    recall  f1-score   support

           0       0.47      0.48      0.48     11538
           1       0.48      0.47      0.48     11702

    accuracy                           0.48     23240
   macro avg       0.48      0.48      0.48     23240
weighted avg       0.48      0.48      0.48     23240



cv

In [None]:
lr_model_on_books_cv = LogisticRegression(max_iter=200, C=10, solver='liblinear', random_state=5)
cv_scores = cross_val_score(lr_model_on_books_cv, X_books, y_books, cv=kf, scoring='accuracy')
print(f'Cross-validation scores: {cv_scores}')
print(f'Mean cross-validation score: {cv_scores.mean()}')

Cross-validation scores: [0.86876076 0.8689759  0.86273666 0.85886403 0.86962134]
Mean cross-validation score: 0.8657917383820999


In [None]:
lr_model_on_songs_cv = LogisticRegression(max_iter=200, C=10, solver='liblinear', random_state=5)
cv_scores = cross_val_score(lr_model_on_songs_cv, X_songs, y_songs, cv=kf, scoring='accuracy')
print(f'Cross-validation scores: {cv_scores}')
print(f'Mean cross-validation score: {cv_scores.mean()}')

Cross-validation scores: [0.73132695 0.74062927 0.72605364 0.72851669 0.71866448]
Mean cross-validation score: 0.7290382071032101


#### Decision trees

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
param_grid_dt = {
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

grid_search_dt = GridSearchCV(DecisionTreeClassifier(random_state=5), param_grid_dt, cv=kf, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search_dt.fit(X_books, y_books)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


In [None]:
print("Best parameters found: ", grid_search_dt.best_params_)
print("Best cross-validation score: ", grid_search_dt.best_score_)

Best parameters found:  {'criterion': 'gini', 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10}
Best cross-validation score:  0.7501721170395869


In [None]:
dt_model_on_books = DecisionTreeClassifier(criterion='gini', max_depth=30, min_samples_leaf=1, min_samples_split=10, random_state=0)
dt_model_on_books.fit(X_books, y_books)
y_pred = dt_model_on_books.predict(X_songs)

In [None]:
print(classification_report(y_songs, y_pred))

              precision    recall  f1-score   support

           0       0.51      0.90      0.65      9180
           1       0.56      0.13      0.21      9092

    accuracy                           0.52     18272
   macro avg       0.53      0.51      0.43     18272
weighted avg       0.53      0.52      0.43     18272



In [None]:
dt_model_on_songs = DecisionTreeClassifier(criterion='gini', max_depth=30, min_samples_leaf=1, min_samples_split=10, random_state=0)
dt_model_on_songs.fit(X_songs, y_songs)
y_pred = dt_model_on_songs.predict(X_books)

In [None]:
print(classification_report(y_books, y_pred))

              precision    recall  f1-score   support

           0       0.32      0.02      0.03     11538
           1       0.50      0.97      0.66     11702

    accuracy                           0.49     23240
   macro avg       0.41      0.49      0.34     23240
weighted avg       0.41      0.49      0.35     23240



cv

In [None]:
dt_model_cv_books = DecisionTreeClassifier(criterion='gini', max_depth=30, min_samples_leaf=1, min_samples_split=10, random_state=0)
cv_scores = cross_val_score(dt_model_cv_books, X_books, y_books, cv=kf, scoring='accuracy')
print(f'Cross-validation scores: {cv_scores}')
print(f'Mean cross-validation score: {cv_scores.mean()}')

Cross-validation scores: [0.75494836 0.75064544 0.75021515 0.74505164 0.7439759 ]
Mean cross-validation score: 0.7489672977624784


In [None]:
dt_model_cv_songs = DecisionTreeClassifier(criterion='gini', max_depth=30, min_samples_leaf=1, min_samples_split=10, random_state=0)
cv_scores = cross_val_score(dt_model_cv_songs, X_songs, y_songs, cv=kf, scoring='accuracy')
print(f'Cross-validation scores: {cv_scores}')
print(f'Mean cross-validation score: {cv_scores.mean()}')

Cross-validation scores: [0.62380301 0.62708618 0.62534209 0.61904762 0.61877395]
Mean cross-validation score: 0.6228105698307124


#### Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

In [None]:
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=5), param_grid_rf, cv=kf, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search_rf.fit(X_books, y_books)

Fitting 5 folds for each of 162 candidates, totalling 810 fits


In [None]:
print("Best parameters found: ", grid_search_rf.best_params_)
print("Best cross-validation score: ", grid_search_rf.best_score_)

Best parameters found:  {'criterion': 'entropy', 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best cross-validation score:  0.7829173838209984


In [None]:
rf_model_on_books = RandomForestClassifier(**grid_search_rf.best_params_, random_state=0)
rf_model_on_books.fit(X_books, y_books)
y_pred_books = rf_model_on_books.predict(X_songs)

In [None]:
print(classification_report(y_songs, y_pred_books))

              precision    recall  f1-score   support

           0       0.51      0.91      0.65      9180
           1       0.54      0.10      0.17      9092

    accuracy                           0.51     18272
   macro avg       0.52      0.51      0.41     18272
weighted avg       0.52      0.51      0.41     18272



In [None]:
rf_model_on_songs = RandomForestClassifier(**grid_search_rf.best_params_, random_state=0)
rf_model_on_songs.fit(X_songs, y_songs)
y_pred_songs = rf_model_on_songs.predict(X_books)

In [None]:
print(classification_report(y_books, y_pred_songs))

              precision    recall  f1-score   support

           0       0.48      0.24      0.32     11538
           1       0.50      0.75      0.60     11702

    accuracy                           0.49     23240
   macro avg       0.49      0.49      0.46     23240
weighted avg       0.49      0.49      0.46     23240



cv

In [None]:
rf_model_cv_books = RandomForestClassifier(**grid_search_rf.best_params_, random_state=0)
cv_scores_books = cross_val_score(rf_model_cv_books, X_books, y_books, cv=kf, scoring='accuracy')
print(f'Cross-validation scores on Books dataset: {cv_scores_books}')
print(f'Mean cross-validation score on Books dataset: {cv_scores_books.mean()}')

Cross-validation scores on Books dataset: [0.78915663 0.78614458 0.7792599  0.78119621 0.77517212]
Mean cross-validation score on Books dataset: 0.7821858864027539


In [None]:
rf_model_cv_songs = RandomForestClassifier(**grid_search_rf.best_params_, random_state=0)
cv_scores_songs = cross_val_score(rf_model_cv_songs, X_songs, y_songs, cv=kf, scoring='accuracy')
print(f'Cross-validation scores on Songs dataset: {cv_scores_songs}')
print(f'Mean cross-validation score on Songs dataset: {cv_scores_songs.mean()}')

Cross-validation scores on Songs dataset: [0.69575923 0.69767442 0.69731801 0.7014231  0.68801314]
Mean cross-validation score on Songs dataset: 0.6960375788914871
