### Data preparation

In [None]:
import os
import re
import pandas as pd

In [None]:
# Mount Google Drive inside the Colab runtime; every data path used below
# lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def preprocess(text):
  """Normalise a text fragment to lowercase, space-separated word characters.

  Steps, in order: lowercase, drop digit runs, drop punctuation (underscores
  included), collapse every whitespace run to one space, trim both ends.

  Args:
    text: the string to clean.

  Returns:
    The cleaned string; empty when the input held nothing but digits,
    punctuation and whitespace.
  """
  text = text.lower()  # Convert to lowercase
  text = re.sub(r'\d+', '', text)  # Remove digits
  # `\w` also matches '_', so `[^\w\s]` on its own leaves underscores (e.g.
  # `_emphasis_` markup) glued to words; remove them explicitly as well.
  text = re.sub(r'[^\w\s]|_', '', text)  # Remove punctuation
  text = re.sub(r'\s+', ' ', text)  # Collapse whitespace runs (incl. newlines)
  return text.strip()

#### Books

In [None]:
# Read every file under the books directory into one of two lists, chosen by
# filename prefix: 'm_' goes to the male corpus, anything else to the female one.
# NOTE(review): any stray non-book file in the tree also lands in the female
# list — confirm the directory holds only book texts.
lit_corpus_male = []
lit_corpus_female = []
for root, dirs, files in os.walk('/content/drive/MyDrive/Master_s/HLT/data/books'):
  dirs.sort()  # in-place sort makes os.walk descend sub-directories in a stable order
  for file in sorted(files):  # stable file order -> reproducible row order downstream
    corpus = lit_corpus_male if file.startswith('m_') else lit_corpus_female
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
      corpus.append(f.read())

In [None]:
# Split each book into paragraphs on blank lines and clean every paragraph.
# Each corpus is iterated on its own: indexing both lists with one shared
# counter raises IndexError when there are fewer female books than male ones,
# and silently drops the surplus female books in the opposite case.
paragraphs_male = [preprocess(paragraph)
                   for book in lit_corpus_male
                   for paragraph in book.split('\n\n')]
paragraphs_female = [preprocess(paragraph)
                     for book in lit_corpus_female
                     for paragraph in book.split('\n\n')]

In [None]:
# Label the paragraphs (0 = male corpus, 1 = female corpus) and stack both
# groups into one frame with a fresh consecutive index.
books_male = pd.DataFrame({'paragraph': paragraphs_male, 'gender': 0})
books_female = pd.DataFrame({'paragraph': paragraphs_female, 'gender': 1})
df_books = pd.concat([books_male, books_female], ignore_index=True)

In [None]:
# Trim leading and trailing whitespace from every paragraph.
df_books['paragraph'] = df_books['paragraph'].map(str.strip)

In [None]:
# Keep only paragraphs longer than two characters, then renumber the rows.
long_enough = df_books['paragraph'].map(len) > 2
df_books = df_books.loc[long_enough].reset_index(drop=True)

#### Songs

In [None]:
import zipfile

In [None]:
# Read the lyrics CSV straight out of the zip archive, without extracting it to disk.
songs_zip_path = '/content/drive/MyDrive/Master_s/HLT/data/songs/spotify_millsongdata.csv.zip'
with zipfile.ZipFile(songs_zip_path, 'r') as archive, archive.open('spotify_millsongdata.csv') as csv_file:
    df_songs_spotify = pd.read_csv(csv_file)

In [None]:
# Load the two artist lists as raw lines. Each entry keeps its trailing
# newline here; the names are stripped where they are matched against artists.
with open('/content/drive/MyDrive/Master_s/HLT/data/songs/male_songwriters', 'r') as male_file:
  male_songwriters = list(male_file)
with open('/content/drive/MyDrive/Master_s/HLT/data/songs/female_songwriters', 'r') as female_file:
  female_songwriters = list(female_file)

In [None]:
# Start every song with the sentinel label -1, i.e. no gender assigned yet.
df_songs_spotify['gender'] = -1

In [None]:
# Label songs whose artist appears in a list: 0 for the male list, 1 for the
# female list. The female assignment runs last, so an artist present in both
# lists ends up labelled 1.
male_names = [artist.strip() for artist in male_songwriters]
female_names = [artist.strip() for artist in female_songwriters]
df_songs_spotify.loc[df_songs_spotify['artist'].isin(male_names), 'gender'] = 0
df_songs_spotify.loc[df_songs_spotify['artist'].isin(female_names), 'gender'] = 1

In [None]:
# Keep only songs labelled 0 or 1, retain just the lyrics and the label,
# and renumber the rows.
is_labelled = df_songs_spotify['gender'].isin([0, 1])
df_songs_spotify = df_songs_spotify.loc[is_labelled, ['text', 'gender']].reset_index(drop=True)

In [None]:
# Expand each song into one row per stanza ("paragraph"). The split assumes
# stanzas are separated by a CRLF, two spaces and another CRLF.
# Non-greedy match: the greedy `\[.*]` removed everything between the first '['
# and the last ']' on a line, deleting real lyrics that sit between two tags
# such as "[Chorus] ... [x2]".
bracket_tag = re.compile(r'\[.*?\]')
new_rows = []
for _, row in df_songs_spotify.iterrows():
    for paragraph in row['text'].split('\r\n  \r\n'):
        cleaned = preprocess(bracket_tag.sub('', paragraph))
        # Test emptiness after cleaning, so stanzas made only of tags, digits
        # or punctuation do not become empty samples.
        if cleaned:
            new_row = row.copy()  # carries 'text' and 'gender' over to the stanza row
            new_row['paragraph'] = cleaned
            new_rows.append(new_row)

In [None]:
# Assemble the per-stanza rows collected in new_rows into a single frame.
df_songs = pd.DataFrame(new_rows)

In [None]:
# Discard stanzas containing the substring 'lyrics', drop the raw song text
# column and renumber the rows.
without_lyrics_word = df_songs['paragraph'].map(lambda stanza: 'lyrics' not in stanza)
df_songs = df_songs.loc[without_lyrics_word].drop(columns=['text']).reset_index(drop=True)

### Models

#### Feature extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# X = vectorizer.fit_transform(df_books['paragraph'])
# y = y_train = df_books['gender']

In [None]:
# Learn a TF-IDF vocabulary (capped at 5000 features) from the book paragraphs
# and turn them into the training matrix; the labels are the gender column.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(df_books['paragraph'])
y_train = df_books['gender']

In [None]:
# Project the songs into the feature space learned from the books. Calling
# fit_transform here would refit the vocabulary and IDF weights on the songs,
# so column j of X_test would no longer denote the same term as column j of
# X_train and every model's predictions on X_test would be meaningless.
# NOTE: the metrics saved in this notebook's outputs were produced with the
# refit vectorizer; re-run the model cells to refresh them.
X_test = vectorizer.transform(df_songs['paragraph'])
y_test = df_songs['gender']

#### SVM

In [None]:
from sklearn.svm import SVC
# from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score, KFold

In [None]:
# Fit a linear-kernel SVM on the book features, then predict labels for the songs.
svm_model = SVC(kernel='linear', random_state=5)
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)  # y_pred is reassigned by each later model section

In [None]:
# Share of song paragraphs whose predicted label equals the true label.
accuracy_score(y_test, y_pred)

0.5871797243548238

In [None]:
# Per-class precision, recall and F1 of the SVM predictions on the songs.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.87      0.72     13454
           1       0.37      0.13      0.19      8168

    accuracy                           0.59     21622
   macro avg       0.49      0.50      0.46     21622
weighted avg       0.53      0.59      0.52     21622



In [None]:
# Shared 5-fold splitter (shuffled, fixed seed) reused by every model's
# cross-validation below.
# NOTE(review): rows are individual paragraphs, so paragraphs of the same book
# can presumably fall on both sides of a split; the CV scores may overstate
# how well a model generalises to unseen authors — confirm.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

In [None]:
# Cross-validated accuracy of the SVM on the book (training) data only.
cv_scores = cross_val_score(svm_model, X_train, y_train, cv=kf, scoring='accuracy')

In [None]:
# Show the per-fold accuracies and their mean.
print(f'Cross-validation scores: {cv_scores}')
print(f'Mean cross-validation score: {cv_scores.mean()}')

Cross-validation scores: [0.91218354 0.91337025 0.91254452 0.90779581 0.90779581]
Mean cross-validation score: 0.9107379854533069


#### Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Multinomial Naive Bayes with the library's default settings.
nb_model = MultinomialNB()

In [None]:
# Train on the book features and predict labels for the songs
# (overwrites the previous model's y_pred).
nb_model.fit(X_train, y_train)
y_pred = nb_model.predict(X_test)

In [None]:
# Per-class precision, recall and F1 of the Naive Bayes predictions on the songs.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.63      0.56      0.59     13454
           1       0.38      0.45      0.41      8168

    accuracy                           0.52     21622
   macro avg       0.51      0.51      0.50     21622
weighted avg       0.54      0.52      0.53     21622



In [None]:
# Cross-validated accuracy of Naive Bayes on the book data, using the shared
# kf splitter; cv_scores is overwritten for each model.
cv_scores = cross_val_score(nb_model, X_train, y_train, cv=kf, scoring='accuracy')
print(f'Cross-validation scores: {cv_scores}')
print(f'Mean cross-validation score: {cv_scores.mean()}')

Cross-validation scores: [0.89240506 0.89754747 0.8895924  0.89315394 0.8895924 ]
Mean cross-validation score: 0.8924582546472777


#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with the solver's iteration cap set to 1000 and a fixed seed.
lr_model = LogisticRegression(max_iter=1000, random_state=5)

In [None]:
# Train on the book features and predict labels for the songs
# (overwrites the previous model's y_pred).
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

In [None]:
# Per-class precision, recall and F1 of the logistic-regression predictions on the songs.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.98      0.76     13454
           1       0.33      0.01      0.03      8168

    accuracy                           0.62     21622
   macro avg       0.47      0.50      0.39     21622
weighted avg       0.51      0.62      0.48     21622



In [None]:
# Cross-validated accuracy of logistic regression on the book data, using the
# shared kf splitter; cv_scores is overwritten for each model.
cv_scores = cross_val_score(lr_model, X_train, y_train, cv=kf, scoring='accuracy')
print(f'Cross-validation scores: {cv_scores}')
print(f'Mean cross-validation score: {cv_scores.mean()}')

Cross-validation scores: [0.89121835 0.89596519 0.89790265 0.89473684 0.88642659]
Mean cross-validation score: 0.89324992611442


#### Decision trees

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree with default hyper-parameters and a fixed seed.
dt_model = DecisionTreeClassifier(random_state=0)

In [None]:
# Train on the book features and predict labels for the songs
# (overwrites the previous model's y_pred).
dt_model.fit(X_train, y_train)
y_pred = dt_model.predict(X_test)

In [None]:
# Per-class precision, recall and F1 of the decision-tree predictions on the songs.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.64      0.13      0.21     13454
           1       0.38      0.88      0.53      8168

    accuracy                           0.41     21622
   macro avg       0.51      0.50      0.37     21622
weighted avg       0.54      0.41      0.33     21622



In [None]:
# Cross-validated accuracy of the decision tree on the book data, using the
# shared kf splitter; cv_scores is overwritten for each model.
cv_scores = cross_val_score(dt_model, X_train, y_train, cv=kf, scoring='accuracy')
print(f'Cross-validation scores: {cv_scores}')
print(f'Mean cross-validation score: {cv_scores.mean()}')

Cross-validation scores: [0.82476266 0.81685127 0.83419074 0.83696082 0.83339929]
Mean cross-validation score: 0.8292329549723743


In [None]:
# Leftover scratch cell: evaluates the literal 1 and has no effect on the analysis.
1