## Notebook Containing Text Classification Modeling Pipelines
### ACL22 SRW confidential submission
---

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import warnings
warnings.simplefilter('ignore')
import glob
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from transformers import BertModel, BertTokenizer
from transformers import logging
logging.set_verbosity_error()

from helpers import stratified_kfold, run_gender_model, get_bert_average_across_text_tokens, run_author_attribution_model


In [2]:
# authorship profiling internal vars

experiment_name = 'final-refined-set-old' #folder name for saving new outputs

write_data_dir = 'acl22-data/final-data/cleaned-data-authorship-profiling/'+experiment_name
if not os.path.isdir(write_data_dir):
    os.mkdir(write_data_dir)

write_model_dir = 'models/final-refined-set-old'
if not os.path.isdir(write_model_dir):
    os.mkdir(write_model_dir)

In [3]:
"""
1. gender profiling data partitioning
"""

#first do twitter

data_path = 'acl22-data/intermediate-data/cleaned-data-authorship-profiling/final-refined-set/twitter_celeb_profiling.csv'
df = pd.read_csv(data_path, index_col='Unnamed: 0')
print(df.columns)
print(df.shape)
print(df.sample(2), '\n')
#Check for missing values
missing_df = df.isna().sum()
missing_df = pd.DataFrame({'variable' : missing_df.index, 'count' : missing_df.values})
missing_df['ratio'] = missing_df['count']/df.shape[0]
missing_df = missing_df[missing_df['count'] > 0]
print('There are', len(missing_df), 'NA values \n')
#missing_df.sort_values(by=['ratio'], inplace=True)
#missing_df.plot(kind='barh', x='variable', y='ratio', title='% rows of missing values', sort_columns=True)

df['target'][df['target']=='F'] = 0
df['target'][df['target']=='M'] = 1
print(f"There are {str(len(df['target'][df['target']==0])/len(df['target'])*100)[0:5]}% of Female profiles in the dataset. \n")

df_modeling = stratified_kfold(df, 5)
print(df_modeling.sample(2))
df_modeling.to_csv(write_data_dir+'/twitter_celeb_profiling_folded.csv')



#second do instagram/twitter mixed

data_path = 'acl22-data/intermediate-data/cleaned-data-authorship-profiling/final-refined-set/mixed_celeb_profiling.csv'
df = pd.read_csv(data_path, index_col='Unnamed: 0')
print(df.columns)
print(df.shape)
print(df.sample(2), '\n')
#Check for missing values
missing_df = df.isna().sum()
missing_df = pd.DataFrame({'variable' : missing_df.index, 'count' : missing_df.values})
missing_df['ratio'] = missing_df['count']/df.shape[0]
missing_df = missing_df[missing_df['count'] > 0]
print('There are', len(missing_df), 'NA values \n')
#missing_df.sort_values(by=['ratio'], inplace=True)
#missing_df.plot(kind='barh', x='variable', y='ratio', title='% rows of missing values', sort_columns=True)

df['target'][df['target']=='F'] = 0
df['target'][df['target']=='M'] = 1
print(f"There are {str(len(df['target'][df['target']==0])/len(df['target'])*100)[0:5]}% of Female profiles in the dataset. \n")

df_modeling = stratified_kfold(df, 5)
print(df_modeling.sample(2))
df_modeling.to_csv(write_data_dir+'/mixed_celeb_profiling_folded.csv')

Index(['tweet', 'target'], dtype='object')
(372, 2)
                                                 tweet  target
310  Good to know I actually meant to text that jus...       1
110  Yung tawa ko Nakakamiss among us <url><sep>. M...       0 

There are 1 NA values 

There are 37.63% of Female profiles in the dataset. 

                                                 tweet  target  fold
13   daddy got us matching rings <url><sep>. @<user...       0   1.0
154  Happening to me <url><sep>. Incredible Congrat...       1   1.0
Index(['post', 'target'], dtype='object')
(372, 2)
                                                  post  target
56   <url><sep>. <sep>. This is not true Dont belie...       0
175  Say you dont drink cause you get Chosen Weedmi...       1 

There are 0 NA values 

There are 37.63% of Female profiles in the dataset. 

                                                post  target  fold
8  # rembeautyexperience thank you @<user> for th...       0   1.0
2  the RIH ISSUE c

In [6]:
"""
2. gender profiling modeling part 1: HPO cross val training 
"""
#results may slightly vary due to random number generator differences

# first do twitter model

df_modeling = pd.read_csv(write_data_dir+'/twitter_celeb_profiling_folded.csv', index_col = 'Unnamed: 0')
df_modeling

# vectorizer settings set to Radivchev et al. (ACL18)
vectorizer = TfidfVectorizer(
    analyzer='word', 
    sublinear_tf=True, 
    strip_accents='unicode',
    ngram_range=(2, 2),
    max_features=10000)

#SVMs
print('Support vector machine model training twitter only dataset:\n')
clf = SVC(class_weight="balanced")
param_grid = {"C":[0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.2], 
              "kernel":["linear", "poly", "rbf", "sigmoid"], 
              "gamma":["scale", "auto"]
             }

stats_list = []
model_list = []

for itr in range(1,6):
    #neptune.create_experiment(f'jupyter-pass-SVM-mix-fold{itr}',tags=['Mix', "Gender Profiling", "SVM"])
    print('fold ', itr)
    model, stats = run_gender_model(clf,
                         param_grid,
                         X = df_modeling[['post', 'fold']], 
                         y = df_modeling[['target', 'fold']],
                         k = itr,
                         vectorizer = vectorizer)
    print(stats)
    print(model, '\n')
    
    stats_list.append(stats)
    model_list.append(model)
    
    pickle.dump(model, open(write_model_dir+f"/profiling-SVM-fold{itr}-twitter.pkl", "wb"))
    #neptune.log_artifact(f"modeling/gender-profiling/SVM/SVM-fold{itr}-mix.pkl")
    #eptune.log_metric('Validation-accuracy', stats['accuracy'])
    #neptune.log_metric('Validation-roc_auc', stats['roc_auc'])
    #neptune.log_metric('Validation-f1', stats['f1'])
    #neptune.log_metric('Fold', itr)


#LRs
print('Logistic regression model training twitter only dataset:\n')
clf = LogisticRegression(class_weight="balanced")
param_grid = {"penalty":['l1', 'l2', 'elasticnet', 'none'], 
              "solver":['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
             }

stats_list = []
model_list = []

for itr in range(1,6):
    #neptune.create_experiment(f'jupyter-pass-SVM-mix-fold{itr}',tags=['Mix', "Gender Profiling", "SVM"])
    print('fold ', itr)
    model, stats = run_gender_model(clf,
                         param_grid,
                         X = df_modeling[['post', 'fold']], 
                         y = df_modeling[['target', 'fold']],
                         k = itr,
                         vectorizer = vectorizer)
    print(stats)
    print(model, '\n')
    
    stats_list.append(stats)
    model_list.append(model)
    
    pickle.dump(model, open(write_model_dir+f"/profiling-LR-fold{itr}-twitter.pkl", "wb"))
    #neptune.log_artifact(f"modeling/gender-profiling/SVM/SVM-fold{itr}-mix.pkl")
    #eptune.log_metric('Validation-accuracy', stats['accuracy'])
    #neptune.log_metric('Validation-roc_auc', stats['roc_auc'])
    #neptune.log_metric('Validation-f1', stats['f1'])
    #neptune.log_metric('Fold', itr)


# second do twitter/instagram mixed

df_modeling = pd.read_csv(write_data_dir+'/mixed_celeb_profiling_folded.csv', index_col = 'Unnamed: 0')
df_modeling

# vectorizer settings set to Radivchev et al. (ACL18)
vectorizer = TfidfVectorizer(
    analyzer='word', 
    sublinear_tf=True, 
    strip_accents='unicode',
    ngram_range=(2, 2),
    max_features=10000)

#SVMs
print('Support vector machine model training platform-mixed:\n')
clf = SVC(class_weight="balanced")
param_grid = {"C":[0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.2], 
              "kernel":["linear", "poly", "rbf", "sigmoid"], 
              "gamma":["scale", "auto"]
             }

stats_list = []
model_list = []

for itr in range(1,6):
    #neptune.create_experiment(f'jupyter-pass-SVM-mix-fold{itr}',tags=['Mix', "Gender Profiling", "SVM"])
    print('fold ', itr)
    model, stats = run_gender_model(clf,
                         param_grid,
                         X = df_modeling[['post', 'fold']], 
                         y = df_modeling[['target', 'fold']],
                         k = itr,
                         vectorizer = vectorizer)
    print(stats)
    print(model, '\n')
    
    stats_list.append(stats)
    model_list.append(model)
    
    pickle.dump(model, open(write_model_dir+f"/profiling-SVM-fold{itr}-mix.pkl", "wb"))
    #neptune.log_artifact(f"modeling/gender-profiling/SVM/SVM-fold{itr}-mix.pkl")
    #eptune.log_metric('Validation-accuracy', stats['accuracy'])
    #neptune.log_metric('Validation-roc_auc', stats['roc_auc'])
    #neptune.log_metric('Validation-f1', stats['f1'])
    #neptune.log_metric('Fold', itr)


#LRs
print('Logistic regression model training  platform-mixed:\n')
clf = LogisticRegression(class_weight="balanced")
param_grid = {"penalty":['l1', 'l2', 'elasticnet', 'none'], 
              "solver":['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
             }

stats_list = []
model_list = []

for itr in range(1,6):
    #neptune.create_experiment(f'jupyter-pass-SVM-mix-fold{itr}',tags=['Mix', "Gender Profiling", "SVM"])
    print('fold ', itr)
    model, stats = run_gender_model(clf,
                         param_grid,
                         X = df_modeling[['post', 'fold']], 
                         y = df_modeling[['target', 'fold']],
                         k = itr,
                         vectorizer = vectorizer)
    print(stats)
    print(model, '\n')
    
    stats_list.append(stats)
    model_list.append(model)
    
    pickle.dump(model, open(write_model_dir+f"/profiling-LR-fold{itr}-mix.pkl", "wb"))
    #neptune.log_artifact(f"modeling/gender-profiling/SVM/SVM-fold{itr}-mix.pkl")
    #eptune.log_metric('Validation-accuracy', stats['accuracy'])
    #neptune.log_metric('Validation-roc_auc', stats['roc_auc'])
    #neptune.log_metric('Validation-f1', stats['f1'])
    #neptune.log_metric('Fold', itr)
    

Support vector machine model training twitter only dataset:

fold  1
n_samples train: 298, n_features train: 10000
n_samples val: 74, n_features val: 10000
{'accuracy': 0.7972972972972973, 'roc_auc': 0.7950310559006211, 'f1': 0.8314606741573034}
SVC(C=1, class_weight='balanced', kernel='sigmoid') 

fold  2
n_samples train: 298, n_features train: 10000
n_samples val: 74, n_features val: 10000
{'accuracy': 0.8918918918918919, 'roc_auc': 0.8850931677018634, 'f1': 0.9130434782608695}
SVC(C=1, class_weight='balanced', kernel='sigmoid') 

fold  3
n_samples train: 298, n_features train: 10000
n_samples val: 74, n_features val: 10000
{'accuracy': 0.8108108108108109, 'roc_auc': 0.8059006211180125, 'f1': 0.8444444444444444}
SVC(C=1, class_weight='balanced', kernel='sigmoid') 

fold  4
n_samples train: 298, n_features train: 10000
n_samples val: 74, n_features val: 10000
{'accuracy': 0.8513513513513513, 'roc_auc': 0.8524844720496896, 'f1': 0.8764044943820224}
SVC(C=1.2, class_weight='balanced', k

In [7]:
"""
3. gender profiling modeling part 2: re-fitting winning fold on entire dataset  
"""

# Final model LR mixed instagram + twitter (fold 3)

df_modeling = pd.read_csv(write_data_dir+'/mixed_celeb_profiling_folded.csv', index_col = 'Unnamed: 0')
# tfidf featurizer
X = df_modeling[['post']]
y = df_modeling[['target']]
vectorizer = TfidfVectorizer(
    analyzer='word', 
    sublinear_tf=True, 
    strip_accents='unicode',
    ngram_range=(2, 2),
    max_features=10000)
X_train, y_train = [X['post'], np.array(y['target']).astype(int)]    
# redefine Xs with tfidf
X_train_tf = vectorizer.fit_transform(X_train)
X_train_tf = vectorizer.transform(X_train)
print("n_samples train: %d, n_features train: %d" % X_train_tf.shape)
model = pickle.load(open("models/final-refined-set-old/profiling-LR-fold3-mix.pkl", 'rb'))
model.fit(X_train_tf, y_train)
pickle.dump(model, open(f"models/final-refined-set-old/final-profiling-LR-mix.pkl", "wb"))
print('model written to file \n')


# Final model SVM mixed instagram + twitter (fold 3)

df_modeling = pd.read_csv(write_data_dir+'/mixed_celeb_profiling_folded.csv', index_col = 'Unnamed: 0')
# tfidf featurizer
X = df_modeling[['post']]
y = df_modeling[['target']]
vectorizer = TfidfVectorizer(
    analyzer='word', 
    sublinear_tf=True, 
    strip_accents='unicode',
    ngram_range=(2, 2),
    max_features=10000)
X_train, y_train = [X['post'], np.array(y['target']).astype(int)]    
# redefine Xs with tfidf
X_train_tf = vectorizer.fit_transform(X_train)
X_train_tf = vectorizer.transform(X_train)
print("n_samples train: %d, n_features train: %d" % X_train_tf.shape)
model = pickle.load(open("models/final-refined-set-old/profiling-SVM-fold3-mix.pkl", 'rb'))
model.fit(X_train_tf, y_train)
pickle.dump(model, open(f"models/final-refined-set-old/final-profiling-SVM-mix.pkl", "wb"))
print('model written to file \n')


# Final model LR twitter (fold 2)

df_modeling = pd.read_csv(write_data_dir+'/twitter_celeb_profiling_folded.csv', index_col = 'Unnamed: 0')
# tfidf featurizer
X = df_modeling[['post']]
y = df_modeling[['target']]
vectorizer = TfidfVectorizer(
    analyzer='word', 
    sublinear_tf=True, 
    strip_accents='unicode',
    ngram_range=(2, 2),
    max_features=10000)
X_train, y_train = [X['post'], np.array(y['target']).astype(int)]    
# redefine Xs with tfidf
X_train_tf = vectorizer.fit_transform(X_train)
X_train_tf = vectorizer.transform(X_train)
print("n_samples train: %d, n_features train: %d" % X_train_tf.shape)
model = pickle.load(open("models/final-refined-set-old/profiling-LR-fold2-twitter.pkl", 'rb'))
model.fit(X_train_tf, y_train)
pickle.dump(model, open(f"models/final-refined-set-old/final-profiling-LR-twitter.pkl", "wb"))
print('model written to file \n')


# Final model SVM twitter (fold 2)

df_modeling = pd.read_csv(write_data_dir+'/twitter_celeb_profiling_folded.csv', index_col = 'Unnamed: 0')
# tfidf featurizer
X = df_modeling[['post']]
y = df_modeling[['target']]
vectorizer = TfidfVectorizer(
    analyzer='word', 
    sublinear_tf=True, 
    strip_accents='unicode',
    ngram_range=(2, 2),
    max_features=10000)
X_train, y_train = [X['post'], np.array(y['target']).astype(int)]    
# redefine Xs with tfidf
X_train_tf = vectorizer.fit_transform(X_train)
X_train_tf = vectorizer.transform(X_train)
print("n_samples train: %d, n_features train: %d" % X_train_tf.shape)
model = pickle.load(open("models/final-refined-set-old/profiling-SVM-fold2-twitter.pkl", 'rb'))
model.fit(X_train_tf, y_train)
pickle.dump(model, open(f"models/final-refined-set-old/final-profiling-SVM-twitter.pkl", "wb"))
print('model written to file \n')



n_samples train: 372, n_features train: 10000
model written to file 

n_samples train: 372, n_features train: 10000
model written to file 

n_samples train: 372, n_features train: 10000
model written to file 

n_samples train: 372, n_features train: 10000
model written to file 



In [None]:
# authorship attribution internal vars

experiment_name = 'final-refined-set' #folder name for saving new outputs

write_data_dir = 'acl22-data/final-data/cleaned-data-authorship-attribution/'+experiment_name
if not os.path.isdir(write_data_dir):
    os.mkdir(write_data_dir)

write_model_dir = 'models/final-refined-set-old'
if not os.path.isdir(write_model_dir):
    os.mkdir(write_model_dir)

In [None]:
"""
4. authorship attribution data partitioning
"""

#first do twitter

data_path = 'acl22-data/intermediate-data/cleaned-data-authorship-attribution/final-refined-set/twitter_celeb_attribution.csv'
df = pd.read_csv(data_path, index_col='Unnamed: 0')
print(df.columns)
print(df.shape)
print(df.sample(2), '\n')
#Check for missing values
missing_df = df.isna().sum()
missing_df = pd.DataFrame({'variable' : missing_df.index, 'count' : missing_df.values})
missing_df['ratio'] = missing_df['count']/df.shape[0]
missing_df = missing_df[missing_df['count'] > 0]
print('There are', len(missing_df), 'NA values \n')
#missing_df.sort_values(by=['ratio'], inplace=True)
#missing_df.plot(kind='barh', x='variable', y='ratio', title='% rows of missing values', sort_columns=True)

df_modeling = stratified_kfold(df, 5)
print(df_modeling.sample(2))
df_modeling.to_csv(write_data_dir+'/twitter_celeb_attribution_folded.csv')



#second do instagram/twitter mixed

data_path = 'acl22-data/intermediate-data/cleaned-data-authorship-attribution/final-refined-set/mixed_celeb_attribution.csv'
df = pd.read_csv(data_path, index_col='Unnamed: 0')
print(df.columns)
print(df.shape)
print(df.sample(2), '\n')
#Check for missing values
missing_df = df.isna().sum()
missing_df = pd.DataFrame({'variable' : missing_df.index, 'count' : missing_df.values})
missing_df['ratio'] = missing_df['count']/df.shape[0]
missing_df = missing_df[missing_df['count'] > 0]
print('There are', len(missing_df), 'NA values \n')
#missing_df.sort_values(by=['ratio'], inplace=True)
#missing_df.plot(kind='barh', x='variable', y='ratio', title='% rows of missing values', sort_columns=True)

df_modeling = stratified_kfold(df, 5)
print(df_modeling.sample(2))
df_modeling.to_csv(write_data_dir+'/mixed_celeb_attribution_folded.csv')

In [None]:
"""
5. authorship attribution modeling part 1: HPO cross val training 
"""
# results may slightly vary due to random number generator differences

# takes a long time to run, not ran for the example code as a result

# first do twitter model

df_modeling = pd.read_csv(write_data_dir+'/twitter_celeb_attribution_folded.csv', index_col = 'Unnamed: 0')
df_modeling

# vectorizer settings set to Radivchev et al. (ACL18)
vectorizer = TfidfVectorizer(
    analyzer='word', 
    sublinear_tf=True, 
    strip_accents='unicode',
    ngram_range=(2, 2),
    max_features=10000)

#SVMs
print('Support vector machine model training twitter only dataset:\n')
clf = SVC(class_weight="balanced")
param_grid = {"C":[0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.2], 
              "kernel":["linear", "poly", "rbf", "sigmoid"], 
              "gamma":["scale", "auto"]
             }

stats_list = []
model_list = []

for itr in range(1,6):
    #neptune.create_experiment(f'jupyter-pass-SVM-mix-fold{itr}',tags=['Mix', "authorship attribution", "SVM"])
    print('fold ', itr)
    model, stats = run_author_attribution_model(clf,
                         param_grid,
                         X = df_modeling[['post', 'fold']], 
                         y = df_modeling[['target', 'fold']],
                         k = itr,
                         vectorizer = vectorizer)
    print(stats)
    print(model, '\n')
    
    stats_list.append(stats)
    model_list.append(model)
    
    pickle.dump(model, open(write_model_dir+f"/attribution-SVM-fold{itr}-twitter.pkl", "wb"))
    #neptune.log_artifact(f"modeling/authorship-attribution/SVM/SVM-fold{itr}-mix.pkl")
    #eptune.log_metric('Validation-accuracy', stats['accuracy'])
    #neptune.log_metric('Validation-roc_auc', stats['roc_auc'])
    #neptune.log_metric('Validation-f1', stats['f1'])
    #neptune.log_metric('Fold', itr)


#LRs
print('Logistic regression model training twitter only dataset:\n')
clf = LogisticRegression(class_weight="balanced")
param_grid = {"penalty":['l1', 'l2', 'elasticnet', 'none'], 
              "solver":['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
             }

stats_list = []
model_list = []

for itr in range(1,6):
    #neptune.create_experiment(f'jupyter-pass-SVM-mix-fold{itr}',tags=['Mix', "authorship attribution", "SVM"])
    print('fold ', itr)
    model, stats = run_author_attribution_model(clf,
                         param_grid,
                         X = df_modeling[['post', 'fold']], 
                         y = df_modeling[['target', 'fold']],
                         k = itr,
                         vectorizer = vectorizer)
    print(stats)
    print(model, '\n')
    
    stats_list.append(stats)
    model_list.append(model)
    
    pickle.dump(model, open(write_model_dir+f"/attribution-LR-fold{itr}-twitter.pkl", "wb"))
    #neptune.log_artifact(f"modeling/authorship-attribution/SVM/SVM-fold{itr}-mix.pkl")
    #eptune.log_metric('Validation-accuracy', stats['accuracy'])
    #neptune.log_metric('Validation-roc_auc', stats['roc_auc'])
    #neptune.log_metric('Validation-f1', stats['f1'])
    #neptune.log_metric('Fold', itr)



# second do twitter/instagram mixed

df_modeling = pd.read_csv(write_data_dir+'/mixed_celeb_attribution_folded.csv', index_col = 'Unnamed: 0')
df_modeling

# vectorizer settings set to Radivchev et al. (ACL18)
vectorizer = TfidfVectorizer(
    analyzer='word', 
    sublinear_tf=True, 
    strip_accents='unicode',
    ngram_range=(2, 2),
    max_features=10000)

#SVMs
print('Support vector machine model training platform-mixed:\n')
clf = SVC(class_weight="balanced")
param_grid = {"C":[0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.2], 
              "kernel":["linear", "poly", "rbf", "sigmoid"], 
              "gamma":["scale", "auto"]
             }

stats_list = []
model_list = []

for itr in range(1,6):
    #neptune.create_experiment(f'jupyter-pass-SVM-mix-fold{itr}',tags=['Mix', "authorship attribution", "SVM"])
    print('fold ', itr)
    model, stats = run_author_attribution_model(clf,
                         param_grid,
                         X = df_modeling[['post', 'fold']], 
                         y = df_modeling[['target', 'fold']],
                         k = itr,
                         vectorizer = vectorizer)
    print(stats)
    print(model, '\n')
    
    stats_list.append(stats)
    model_list.append(model)
    
    pickle.dump(model, open(write_model_dir+f"/attribution-SVM-fold{itr}-mix.pkl", "wb"))
    #neptune.log_artifact(f"modeling/authorship-attribution/SVM/SVM-fold{itr}-mix.pkl")
    #eptune.log_metric('Validation-accuracy', stats['accuracy'])
    #neptune.log_metric('Validation-roc_auc', stats['roc_auc'])
    #neptune.log_metric('Validation-f1', stats['f1'])
    #neptune.log_metric('Fold', itr)


#LRs
print('Logistic regression model training  platform-mixed:\n')
clf = LogisticRegression(class_weight="balanced")
param_grid = {"penalty":['l1', 'l2', 'elasticnet', 'none'], 
              "solver":['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
             }

stats_list = []
model_list = []

for itr in range(1,6):
    #neptune.create_experiment(f'jupyter-pass-SVM-mix-fold{itr}',tags=['Mix', "authorship attribution", "SVM"])
    print('fold ', itr)
    model, stats = run_author_attribution_model(clf,
                         param_grid,
                         X = df_modeling[['post', 'fold']], 
                         y = df_modeling[['target', 'fold']],
                         k = itr,
                         vectorizer = vectorizer)
    print(stats)
    print(model, '\n')
    
    stats_list.append(stats)
    model_list.append(model)
    
    pickle.dump(model, open(write_model_dir+f"/attribution-LR-fold{itr}-mix.pkl", "wb"))
    #neptune.log_artifact(f"modeling/authorship-attribution/SVM/SVM-fold{itr}-mix.pkl")
    #eptune.log_metric('Validation-accuracy', stats['accuracy'])
    #neptune.log_metric('Validation-roc_auc', stats['roc_auc'])
    #neptune.log_metric('Validation-f1', stats['f1'])
    #neptune.log_metric('Fold', itr)
    

In [None]:
"""
6. authorship attribution modeling part 2: re-fitting winning fold on entire dataset  
"""

# Final model LR mixed instagram + twitter (fold 1)

df_modeling = pd.read_csv(write_data_dir+'/mixed_celeb_attribution_folded.csv', index_col = 'Unnamed: 0')
# tfidf featurizer
X = df_modeling[['post']]
y = df_modeling[['target']]
vectorizer = TfidfVectorizer(
    analyzer='word', 
    sublinear_tf=True, 
    strip_accents='unicode',
    ngram_range=(2, 2),
    max_features=10000)
X_train, y_train = [X['post'], np.array(y['target']).astype(int)]    
# redefine Xs with tfidf
X_train_tf = vectorizer.fit_transform(X_train)
X_train_tf = vectorizer.transform(X_train)
print("n_samples train: %d, n_features train: %d" % X_train_tf.shape)

X_train_tf_nonsparse = pd.DataFrame.sparse.from_spmatrix(X_train_tf)
    
bert_feats_list = []
df_modeling = df_modeling.reset_index(drop=True)
for i in range(0,len(X_train_tf_nonsparse)):
    bert_feats_list.append(pd.DataFrame(get_bert_average_across_text_tokens(string = df_modeling['post'].loc[i],
                                                                            tokenizer = BertTokenizer.from_pretrained('bert-base-cased'),
                                                                            model_bert = BertModel.from_pretrained('bert-base-cased'))).T)
bert_df = pd.concat(bert_feats_list)    
    
X_test_both = pd.merge(X_train_tf_nonsparse, bert_df, left_index=True, right_index=True, suffixes=('_tfidf', '_bert'))

model = pickle.load(open("models/final-refined-set-old/attribution-LR-fold1-mix.pkl", 'rb'))
model.fit(X_test_both, y_train)
pickle.dump(model, open(f"models/final-refined-set-old/final-attribution-LR-mix.pkl", "wb"))
print('model written to file \n')


# Final model SVM mixed instagram + twitter (fold 1)

df_modeling = pd.read_csv(write_data_dir+'/mixed_celeb_attribution_folded.csv', index_col = 'Unnamed: 0')
# tfidf featurizer
X = df_modeling[['post']]
y = df_modeling[['target']]
vectorizer = TfidfVectorizer(
    analyzer='word', 
    sublinear_tf=True, 
    strip_accents='unicode',
    ngram_range=(2, 2),
    max_features=10000)
X_train, y_train = [X['post'], np.array(y['target']).astype(int)]    
# redefine Xs with tfidf
X_train_tf = vectorizer.fit_transform(X_train)
X_train_tf = vectorizer.transform(X_train)
print("n_samples train: %d, n_features train: %d" % X_train_tf.shape)

X_train_tf_nonsparse = pd.DataFrame.sparse.from_spmatrix(X_train_tf)
    
bert_feats_list = []
df_modeling = df_modeling.reset_index(drop=True)
for i in range(0,len(X_train_tf_nonsparse)):
    bert_feats_list.append(pd.DataFrame(get_bert_average_across_text_tokens(string = df_modeling['post'].loc[i],
                                                                            tokenizer = BertTokenizer.from_pretrained('bert-base-cased'),
                                                                            model_bert = BertModel.from_pretrained('bert-base-cased'))).T)
bert_df = pd.concat(bert_feats_list)    
    
X_test_both = pd.merge(X_train_tf_nonsparse, bert_df, left_index=True, right_index=True, suffixes=('_tfidf', '_bert'))

model = pickle.load(open("models/final-refined-set-old/attribution-SVM-fold1-mix.pkl", 'rb'))
model.fit(X_test_both, y_train)
pickle.dump(model, open(f"models/final-refined-set-old/final-attribution-SVM-mix.pkl", "wb"))
print('model written to file \n')


# Final model LR twitter only (fold 1)

df_modeling = pd.read_csv(write_data_dir+'/twitter_celeb_attribution_folded.csv', index_col = 'Unnamed: 0')
# tfidf featurizer
X = df_modeling[['post']]
y = df_modeling[['target']]
vectorizer = TfidfVectorizer(
    analyzer='word', 
    sublinear_tf=True, 
    strip_accents='unicode',
    ngram_range=(2, 2),
    max_features=10000)
X_train, y_train = [X['post'], np.array(y['target']).astype(int)]    
# redefine Xs with tfidf
X_train_tf = vectorizer.fit_transform(X_train)
X_train_tf = vectorizer.transform(X_train)
print("n_samples train: %d, n_features train: %d" % X_train_tf.shape)

X_train_tf_nonsparse = pd.DataFrame.sparse.from_spmatrix(X_train_tf)
    
bert_feats_list = []
df_modeling = df_modeling.reset_index(drop=True)
for i in range(0,len(X_train_tf_nonsparse)):
    bert_feats_list.append(pd.DataFrame(get_bert_average_across_text_tokens(string = df_modeling['post'].loc[i],
                                                                            tokenizer = BertTokenizer.from_pretrained('bert-base-cased'),
                                                                            model_bert = BertModel.from_pretrained('bert-base-cased'))).T)
bert_df = pd.concat(bert_feats_list)    
    
X_test_both = pd.merge(X_train_tf_nonsparse, bert_df, left_index=True, right_index=True, suffixes=('_tfidf', '_bert'))

model = pickle.load(open("models/final-refined-set-old/attribution-LR-fold1-twitter.pkl", 'rb'))
model.fit(X_test_both, y_train)
pickle.dump(model, open(f"models/final-refined-set-old/final-attribution-LR-twitter.pkl", "wb"))
print('model written to file \n')


# Final model SVM twitter only (fold 1)

df_modeling = pd.read_csv(write_data_dir+'/twitter_celeb_attribution_folded.csv', index_col = 'Unnamed: 0')
# tfidf featurizer
X = df_modeling[['post']]
y = df_modeling[['target']]
vectorizer = TfidfVectorizer(
    analyzer='word', 
    sublinear_tf=True, 
    strip_accents='unicode',
    ngram_range=(2, 2),
    max_features=10000)
X_train, y_train = [X['post'], np.array(y['target']).astype(int)]    
# redefine Xs with tfidf
X_train_tf = vectorizer.fit_transform(X_train)
X_train_tf = vectorizer.transform(X_train)
print("n_samples train: %d, n_features train: %d" % X_train_tf.shape)

X_train_tf_nonsparse = pd.DataFrame.sparse.from_spmatrix(X_train_tf)
    
bert_feats_list = []
df_modeling = df_modeling.reset_index(drop=True)
for i in range(0,len(X_train_tf_nonsparse)):
    bert_feats_list.append(pd.DataFrame(get_bert_average_across_text_tokens(string = df_modeling['post'].loc[i],
                                                                            tokenizer = BertTokenizer.from_pretrained('bert-base-cased'),
                                                                            model_bert = BertModel.from_pretrained('bert-base-cased'))).T)
bert_df = pd.concat(bert_feats_list)    
    
X_test_both = pd.merge(X_train_tf_nonsparse, bert_df, left_index=True, right_index=True, suffixes=('_tfidf', '_bert'))

model = pickle.load(open("models/final-refined-set-old/attribution-SVM-fold1-twitter.pkl", 'rb'))
model.fit(X_test_both, y_train)
pickle.dump(model, open(f"models/final-refined-set-old/final-attribution-SVM-twitter.pkl", "wb"))
print('model written to file \n')
