# Machine Learning

### Loading packages

In [None]:
# Standard library
import datetime as dt
import pickle
import time

# Scientific stack
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.sparse
from joblib import parallel_backend

# Machine Learning
from sklearn import metrics # Used to get performance metrics from models
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import ComplementNB
from sklearn.svm import SVC

# Validation of models
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

# for vectorization 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Import google
from google.cloud import storage

## Elements to define

In [None]:
# Target (y) columns that the models are trained against
requested_cols = [
    'blok', 'KF', 'V', 'S', 'CD', 'RV', 'FP',
    'SF', 'KD', 'DF', 'EL', 'LA', 'ALT', 'NB',
]

# Matching "random" target columns: same names, same order, with a '_random' suffix
requested_cols_random = [f'{col}_random' for col in requested_cols]

In [None]:
# Creating function
def create_x_y(dataframe, target_cols=None, random_target_cols=None):
    '''
    Splits a dataframe into the feature block and the target columns.

        Parameters:
            dataframe (dataframe): Frame with the target columns, a marker column whose
                name contains "no_words_after", and the feature columns to the right of it
            target_cols (list, optional): Candidate target column names.
                Defaults to the module-level `requested_cols`
            random_target_cols (list, optional): Candidate random target column names.
                Defaults to the module-level `requested_cols_random`

        Returns:
            X (dataframe): All columns to the right of the marker column
            ys (dataframe): The candidate target columns that exist in `dataframe`
            y_rands (dataframe): The candidate random target columns that exist in `dataframe`
            y_list (Index): Column labels of `ys`
            y_rand_list (Index): Column labels of `y_rands`

        Raises:
            ValueError: If no column name contains "no_words_after"
    '''
    # Resolve defaults at call time so the function also works with explicit lists
    if target_cols is None:
        target_cols = requested_cols
    if random_target_cols is None:
        random_target_cols = requested_cols_random

    # Position of the marker text inside each column name (-1 where it does not occur)
    marker_positions = dataframe.columns.str.find("no_words_after")

    # Without this guard argmax() would return 0 and every column after the first one,
    # including the targets, would silently end up in the feature matrix
    if marker_positions.max() < 0:
        raise ValueError('no column name contains "no_words_after"; cannot locate the feature block')

    # Features are everything to the right of the marker column
    X = dataframe.iloc[:, marker_positions.argmax()+1:]

    # Keep only the targets that are actually present in this dataframe
    valid_ys = [c for c in target_cols if c in dataframe]
    valid_ys_random = [c for c in random_target_cols if c in dataframe]

    ys = dataframe[valid_ys]
    y_rands = dataframe[valid_ys_random]

    return X, ys, y_rands, ys.columns, y_rands.columns

In [None]:
#Creating evaluation function
def _evaluate_split(y_true, y_score, suffix):
    '''Computes the metric set for one data split; every key ends with `_<suffix>`.'''
    precision, recall, _ = metrics.precision_recall_curve(y_true, y_score)
    fp_rate, tp_rate, _ = metrics.roc_curve(y_true, y_score)

    # Each value is wrapped in a one-element list so that the dictionary turns into
    # a single-row dataframe (curve arrays end up as one object cell each)
    return {
        # Accuracy uses the scores rounded to 0/1; np.round also accepts lists and Series
        f"accuracy_{suffix}": [metrics.accuracy_score(y_true, np.round(y_score))],
        f"auc_{suffix}": [metrics.roc_auc_score(y_true, y_score)],
        f"ap_{suffix}": [metrics.average_precision_score(y_true, y_score)],
        f"bs_{suffix}": [metrics.brier_score_loss(y_true, y_score)],
        f"precision_{suffix}": [precision],
        f"recall_{suffix}": [recall],
        f"FP_rate_{suffix}": [fp_rate],
        f"TP_rate_{suffix}": [tp_rate],
    }


def model_evaluation_df(y_train, y_train_pred, y_val, y_val_pred):
    '''
    Calculates evaluation metrics for a training and validation set.

        Parameters:
            y_train (array-like): True labels of the training set
            y_train_pred (array-like): Predicted positive-class scores for the training set
            y_val (array-like): True labels of the validation set
            y_val_pred (array-like): Predicted positive-class scores for the validation set

        Returns:
            model_evaluation_dict (dictionary): Scores for accuracy, AUC, AP, BS, precision,
                recall, FP rate and TP rate. Training metrics use the key suffix `_train`,
                validation metrics use the key suffix `_test`. Every value is a
                one-element list.
    '''
    # Training keys first, then validation keys (stored under the "_test" suffix)
    model_evaluation_dict = {
        **_evaluate_split(y_train, y_train_pred, "train"),
        **_evaluate_split(y_val, y_val_pred, "test"),
    }

    return model_evaluation_dict

In [None]:
def data_splitter(X,y):
    '''
    Splits features and targets into a training part and a held-out part.

    One fifth of the rows is held out; the split is seeded with 42.
    Returns the tuple (X_train, y_train, X_test, y_test).
    '''
    features_fit, features_holdout, target_fit, target_holdout = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
    )

    # Note the order: both training parts first, then both held-out parts
    return features_fit, target_fit, features_holdout, target_holdout

In [None]:
# Identifiers shared by all result keys (same names and order as the target columns)
_result_groups = ('blok', 'KF', 'V', 'S', 'CD', 'RV', 'FP',
                  'SF', 'KD', 'DF', 'EL', 'LA', 'ALT', 'NB')

# Keys under which the score frames are stored
list_of_scores = [f'{name}_scores' for name in _result_groups]

# Keys for the score frames of the random targets
list_of_scores_random = [f'{name}_random_scores' for name in _result_groups]

# Keys under which the prediction frames are stored
list_of_preds = [f'{name}_preds' for name in _result_groups]

# Keys for the prediction frames of the random targets
list_of_random_preds = [f'{name}_random_preds' for name in _result_groups]

## Importing data

In [None]:
# Load the pickled inputs from the `speciale_ml` bucket.
# Each `*_yearly_tfids_dfs` object is indexed and iterated as a sequence of
# dataframes further down (presumably one dataframe per year - TODO confirm).
# NOTE(review): read_pickle unpickles arbitrary objects; only load files from a trusted bucket.
all_speeches_yearly_tfids_dfs = pd.read_pickle('gs://speciale_ml/all_speeches_yearly_tfids_dfs.pkl')
velfærd_speeches_yearly_tfids_dfs = pd.read_pickle('gs://speciale_ml/velfærd_speeches_yearly_tfids_dfs.pkl')
skat_speeches_yearly_tfids_dfs = pd.read_pickle('gs://speciale_ml/skat_speeches_yearly_tfids_dfs.pkl')
klima_speeches_yearly_tfids_dfs = pd.read_pickle('gs://speciale_ml/klima_speeches_yearly_tfids_dfs.pkl')
udlændinge_speeches_yearly_tfids_dfs = pd.read_pickle('gs://speciale_ml/udlændinge_speeches_yearly_tfids_dfs.pkl')
# Placeholder frames that the model cells append when a year has no positive examples for a target
scores_df_empty = pd.read_pickle('gs://speciale_ml/scores_df_empty.pkl')
preds_df_empty = pd.read_pickle('gs://speciale_ml/preds_df_empty.pkl')

## Small data adjustments

In [None]:
def _dedupe_blok_columns(columns):
    '''Returns the column labels with only the first 'blok' kept; later ones become 'blok2', 'blok3', ...'''
    renamed = []
    seen = 0
    for label in columns:
        if label == 'blok':
            seen += 1
            renamed.append('blok' if seen == 1 else f'blok{seen}')
        else:
            renamed.append(label)
    return renamed


# Make the 'blok' column unique in every yearly dataframe of every topic
for list_df in [udlændinge_speeches_yearly_tfids_dfs,
                all_speeches_yearly_tfids_dfs,
                klima_speeches_yearly_tfids_dfs,
                velfærd_speeches_yearly_tfids_dfs,
                skat_speeches_yearly_tfids_dfs
               ]:

    for df in list_df:
        # Explicit version of the check the bare `df['blok']` lookup used to perform
        if 'blok' not in df.columns:
            raise KeyError('blok')

        df.columns = _dedupe_blok_columns(df.columns)

## Naive Bayes - models

### All data

In [None]:
# Target names are taken from the first yearly dataframe
_, _, _, y_list, y_list_rand = create_x_y(all_speeches_yearly_tfids_dfs[0])

# Results per comparison: '<target>_scores' / '<target>_preds' -> list with one dataframe per year
NB_results_all = {}

with parallel_backend('multiprocessing',n_jobs=16):

    # zip pairs each target with its random counterpart, so the loop stops at the shorter list
    for ele_y, ele_y_rand in zip(y_list, y_list_rand):

        # Start time one comparison
        start_time_comparison = time.time()

        # Yearly results for this comparison
        NB_yearly_scores = []
        NB_yearly_preds = []

        # Looping over all the years
        for yearly_df in all_speeches_yearly_tfids_dfs:

            # Start time one year
            start_time_year = time.time()

            # Yearly features and target
            X_yearly, ys, _, _, _ = create_x_y(yearly_df)
            y_yearly = ys[str(ele_y)]

            if y_yearly.sum() == 0:
                # The target sums to zero this year: nothing to fit, use the empty placeholder frames
                scores_df = scores_df_empty
                predictions_df = preds_df_empty

            else:
                # Split data into train and test
                x_train, x_test, y_train, y_test = train_test_split(X_yearly, y_yearly, test_size = 0.25, random_state=42)

                # Seeded shuffle so the folds, and therefore the selected alpha, are reproducible
                cv = StratifiedKFold(n_splits=5, shuffle = True, random_state=42)

                # Grid search over the smoothing parameter, scored by ROC AUC
                model = ComplementNB()
                params = {'alpha': [10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]}
                search = GridSearchCV(model, params, scoring='roc_auc', cv=cv, refit=True)
                result = search.fit(x_train, y_train)

                # refit=True: the best estimator is already refitted on the full training split
                best_model = result.best_estimator_

                # Predicted probability of the positive class
                y_train_pred = best_model.predict_proba(x_train)[:,1]
                y_test_pred = best_model.predict_proba(x_test)[:,1]

                # Metrics for this year
                dict_temp = model_evaluation_df(y_train, y_train_pred, y_test, y_test_pred)

                # One row per year; each cell holds a whole array / the fitted model
                predictions_dict = {'preds_train': [y_train_pred],
                                    'preds_test': [y_test_pred],
                                    'best_model': [best_model],
                                    'index_original': [y_test.index.values]}

                scores_df = pd.DataFrame.from_dict(dict_temp, orient = 'columns')
                predictions_df = pd.DataFrame.from_dict(predictions_dict, orient = 'columns')

            # Appending yearly results
            NB_yearly_scores.append(scores_df)
            NB_yearly_preds.append(predictions_df)

            # Print time for one year (fixed-point; '.2' alone would switch to scientific notation)
            end_time_year = time.time()
            run_time = (end_time_year - start_time_year)/60
            print(f'minutes to run one year: {run_time:.2f}')

        # Saving the results from one comparison
        NB_results_all.update({f'{ele_y}_scores': NB_yearly_scores,
                               f'{ele_y}_preds': NB_yearly_preds})

        # One comparison time
        end_time_comparison = time.time()
        run_time = (end_time_comparison - start_time_comparison)/60
        print(f'minutes to run {ele_y}: {run_time:.2f}')

Saving data

In [None]:
# List of dataframes with scores for each group
group_scores = []

for group in list_of_scores:

    # Results only exist for targets that were present in the data
    if group not in NB_results_all:
        print(f'no results for {group}; skipping')
        continue

    # Stack the yearly score frames and tag them with the group name
    one_group = pd.concat(NB_results_all[group], ignore_index=True)
    one_group['model'] = group

    group_scores.append(one_group)

# pickle is imported in the notebook's import cell
with open('NB_all_speeches_plot.pkl', 'wb') as f:
    pickle.dump(group_scores, f, protocol=4)

In [None]:
# One list per group, holding one prediction dataframe per year
group_preds = []

for group in list_of_preds:

    # Results only exist for targets that were present in the data
    if group not in NB_results_all:
        print(f'no results for {group}; skipping')
        continue

    # explode() unpacks the per-year arrays into one row per test observation
    preds_yearly = [
        pd.DataFrame({"preds_test": list(year['preds_test'].explode()),
                      "index_org": list(year['index_original'].explode())})
        for year in NB_results_all[group]
    ]

    group_preds.append(preds_yearly)

# pickle is imported in the notebook's import cell
with open('NB_all_speeches_preds.pkl', 'wb') as f:
    pickle.dump(group_preds, f, protocol=4)

### Velfærd data

In [None]:
# Target names are taken from the first yearly dataframe
_, _, _, y_list, y_list_rand = create_x_y(velfærd_speeches_yearly_tfids_dfs[0])

# Results per comparison: '<target>_scores' / '<target>_preds' -> list with one dataframe per year
NB_results_velfærd = {}

with parallel_backend('multiprocessing',n_jobs=16):

    # zip pairs each target with its random counterpart, so the loop stops at the shorter list
    for ele_y, ele_y_rand in zip(y_list, y_list_rand):

        # Start time one comparison
        start_time_comparison = time.time()

        # Yearly results for this comparison
        NB_yearly_scores = []
        NB_yearly_preds = []

        # Looping over all the years
        for yearly_df in velfærd_speeches_yearly_tfids_dfs:

            # Start time one year
            start_time_year = time.time()

            # Yearly features and target
            X_yearly, ys, _, _, _ = create_x_y(yearly_df)
            y_yearly = ys[str(ele_y)]

            if y_yearly.sum() == 0:
                # The target sums to zero this year: nothing to fit, use the empty placeholder frames
                scores_df = scores_df_empty
                predictions_df = preds_df_empty

            else:
                # Split data into train and test
                x_train, x_test, y_train, y_test = train_test_split(X_yearly, y_yearly, test_size = 0.25, random_state=42)

                # Seeded shuffle so the folds, and therefore the selected alpha, are reproducible
                cv = StratifiedKFold(n_splits=5, shuffle = True, random_state=42)

                # Grid search over the smoothing parameter, scored by ROC AUC
                model = ComplementNB()
                params = {'alpha': [10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]}
                search = GridSearchCV(model, params, scoring='roc_auc', cv=cv, refit=True)
                result = search.fit(x_train, y_train)

                # refit=True: the best estimator is already refitted on the full training split
                best_model = result.best_estimator_

                # Predicted probability of the positive class
                y_train_pred = best_model.predict_proba(x_train)[:,1]
                y_test_pred = best_model.predict_proba(x_test)[:,1]

                # Metrics for this year
                dict_temp = model_evaluation_df(y_train, y_train_pred, y_test, y_test_pred)

                # One row per year; each cell holds a whole array / the fitted model
                predictions_dict = {'preds_train': [y_train_pred],
                                    'preds_test': [y_test_pred],
                                    'best_model': [best_model],
                                    'index_original': [y_test.index.values]}

                scores_df = pd.DataFrame.from_dict(dict_temp, orient = 'columns')
                predictions_df = pd.DataFrame.from_dict(predictions_dict, orient = 'columns')

            # Appending yearly results
            NB_yearly_scores.append(scores_df)
            NB_yearly_preds.append(predictions_df)

            # Print time for one year (fixed-point; '.2' alone would switch to scientific notation)
            end_time_year = time.time()
            run_time = (end_time_year - start_time_year)/60
            print(f'minutes to run one year: {run_time:.2f}')

        # Saving the results from one comparison
        NB_results_velfærd.update({f'{ele_y}_scores': NB_yearly_scores,
                                   f'{ele_y}_preds': NB_yearly_preds})

        # One comparison time
        end_time_comparison = time.time()
        run_time = (end_time_comparison - start_time_comparison)/60
        print(f'minutes to run {ele_y}: {run_time:.2f}')

Saving data

In [None]:
# List of dataframes with scores for each group
group_scores = []

for group in list_of_scores:

    # Results only exist for targets that were present in the data
    if group not in NB_results_velfærd:
        print(f'no results for {group}; skipping')
        continue

    # Stack the yearly score frames and tag them with the group name
    one_group = pd.concat(NB_results_velfærd[group], ignore_index=True)
    one_group['model'] = group

    group_scores.append(one_group)

# pickle is imported in the notebook's import cell
with open('NB_velfærd_speeches_plot.pkl', 'wb') as f:
    pickle.dump(group_scores, f, protocol=4)

In [None]:
# One list per group, holding one prediction dataframe per year
group_preds = []

for group in list_of_preds:

    # Results only exist for targets that were present in the data
    if group not in NB_results_velfærd:
        print(f'no results for {group}; skipping')
        continue

    # explode() unpacks the per-year arrays into one row per test observation
    preds_yearly = [
        pd.DataFrame({"preds_test": list(year['preds_test'].explode()),
                      "index_org": list(year['index_original'].explode())})
        for year in NB_results_velfærd[group]
    ]

    group_preds.append(preds_yearly)

# pickle is imported in the notebook's import cell
with open('NB_velfærd_speeches_preds.pkl', 'wb') as f:
    pickle.dump(group_preds, f, protocol=4)

### Skat data

In [None]:
# Target names are taken from the first yearly dataframe
_, _, _, y_list, y_list_rand = create_x_y(skat_speeches_yearly_tfids_dfs[0])

# Results per comparison: '<target>_scores' / '<target>_preds' -> list with one dataframe per year
NB_results_skat = {}

with parallel_backend('multiprocessing',n_jobs=16):

    # zip pairs each target with its random counterpart, so the loop stops at the shorter list
    for ele_y, ele_y_rand in zip(y_list, y_list_rand):

        # Start time one comparison
        start_time_comparison = time.time()

        # Yearly results for this comparison
        NB_yearly_scores = []
        NB_yearly_preds = []

        # Looping over all the years
        for yearly_df in skat_speeches_yearly_tfids_dfs:

            # Start time one year
            start_time_year = time.time()

            # Yearly features and target
            X_yearly, ys, _, _, _ = create_x_y(yearly_df)
            y_yearly = ys[str(ele_y)]

            if y_yearly.sum() == 0:
                # The target sums to zero this year: nothing to fit, use the empty placeholder frames
                scores_df = scores_df_empty
                predictions_df = preds_df_empty

            else:
                # Split data into train and test
                x_train, x_test, y_train, y_test = train_test_split(X_yearly, y_yearly, test_size = 0.25, random_state=42)

                # Seeded shuffle so the folds, and therefore the selected alpha, are reproducible
                cv = StratifiedKFold(n_splits=5, shuffle = True, random_state=42)

                # Grid search over the smoothing parameter, scored by ROC AUC
                model = ComplementNB()
                params = {'alpha': [10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]}
                search = GridSearchCV(model, params, scoring='roc_auc', cv=cv, refit=True)
                result = search.fit(x_train, y_train)

                # refit=True: the best estimator is already refitted on the full training split
                best_model = result.best_estimator_

                # Predicted probability of the positive class
                y_train_pred = best_model.predict_proba(x_train)[:,1]
                y_test_pred = best_model.predict_proba(x_test)[:,1]

                # Metrics for this year
                dict_temp = model_evaluation_df(y_train, y_train_pred, y_test, y_test_pred)

                # One row per year; each cell holds a whole array / the fitted model
                predictions_dict = {'preds_train': [y_train_pred],
                                    'preds_test': [y_test_pred],
                                    'best_model': [best_model],
                                    'index_original': [y_test.index.values]}

                scores_df = pd.DataFrame.from_dict(dict_temp, orient = 'columns')
                predictions_df = pd.DataFrame.from_dict(predictions_dict, orient = 'columns')

            # Appending yearly results
            NB_yearly_scores.append(scores_df)
            NB_yearly_preds.append(predictions_df)

            # Print time for one year (fixed-point; '.2' alone would switch to scientific notation)
            end_time_year = time.time()
            run_time = (end_time_year - start_time_year)/60
            print(f'minutes to run one year: {run_time:.2f}')

        # Saving the results from one comparison
        NB_results_skat.update({f'{ele_y}_scores': NB_yearly_scores,
                                f'{ele_y}_preds': NB_yearly_preds})

        # One comparison time
        end_time_comparison = time.time()
        run_time = (end_time_comparison - start_time_comparison)/60
        print(f'minutes to run {ele_y}: {run_time:.2f}')

Saving data

In [None]:
# List of dataframes with scores for each group
group_scores = []

for group in list_of_scores:

    # Results only exist for targets that were present in the data
    if group not in NB_results_skat:
        print(f'no results for {group}; skipping')
        continue

    # Stack the yearly score frames and tag them with the group name
    one_group = pd.concat(NB_results_skat[group], ignore_index=True)
    one_group['model'] = group

    group_scores.append(one_group)

# pickle is imported in the notebook's import cell
with open('NB_skat_speeches_plot.pkl', 'wb') as f:
    pickle.dump(group_scores, f, protocol=4)

In [None]:
# One list per group, holding one prediction dataframe per year
group_preds = []

for group in list_of_preds:

    # Results only exist for targets that were present in the data
    if group not in NB_results_skat:
        print(f'no results for {group}; skipping')
        continue

    # explode() unpacks the per-year arrays into one row per test observation
    preds_yearly = [
        pd.DataFrame({"preds_test": list(year['preds_test'].explode()),
                      "index_org": list(year['index_original'].explode())})
        for year in NB_results_skat[group]
    ]

    group_preds.append(preds_yearly)

# pickle is imported in the notebook's import cell
with open('NB_skat_speeches_preds.pkl', 'wb') as f:
    pickle.dump(group_preds, f, protocol=4)

### Udlændinge data

In [None]:
# Target names are taken from the first yearly dataframe
_, _, _, y_list, y_list_rand = create_x_y(udlændinge_speeches_yearly_tfids_dfs[0])

# Results per comparison: '<target>_scores' / '<target>_preds' -> list with one dataframe per year
NB_results_udlændinge = {}

with parallel_backend('multiprocessing',n_jobs=16):

    # zip pairs each target with its random counterpart, so the loop stops at the shorter list
    for ele_y, ele_y_rand in zip(y_list, y_list_rand):

        # Start time one comparison
        start_time_comparison = time.time()

        # Yearly results for this comparison
        NB_yearly_scores = []
        NB_yearly_preds = []

        # Looping over all the years
        for yearly_df in udlændinge_speeches_yearly_tfids_dfs:

            # Start time one year
            start_time_year = time.time()

            # Yearly features and target
            X_yearly, ys, _, _, _ = create_x_y(yearly_df)
            y_yearly = ys[str(ele_y)]

            if y_yearly.sum() == 0:
                # The target sums to zero this year: nothing to fit, use the empty placeholder frames
                scores_df = scores_df_empty
                predictions_df = preds_df_empty

            else:
                # Split data into train and test
                x_train, x_test, y_train, y_test = train_test_split(X_yearly, y_yearly, test_size = 0.25, random_state=42)

                # Seeded shuffle so the folds, and therefore the selected alpha, are reproducible
                cv = StratifiedKFold(n_splits=5, shuffle = True, random_state=42)

                # Grid search over the smoothing parameter, scored by ROC AUC
                model = ComplementNB()
                params = {'alpha': [10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]}
                search = GridSearchCV(model, params, scoring='roc_auc', cv=cv, refit=True)
                result = search.fit(x_train, y_train)

                # refit=True: the best estimator is already refitted on the full training split
                best_model = result.best_estimator_

                # Predicted probability of the positive class
                y_train_pred = best_model.predict_proba(x_train)[:,1]
                y_test_pred = best_model.predict_proba(x_test)[:,1]

                # Metrics for this year
                dict_temp = model_evaluation_df(y_train, y_train_pred, y_test, y_test_pred)

                # One row per year; each cell holds a whole array / the fitted model
                predictions_dict = {'preds_train': [y_train_pred],
                                    'preds_test': [y_test_pred],
                                    'best_model': [best_model],
                                    'index_original': [y_test.index.values]}

                scores_df = pd.DataFrame.from_dict(dict_temp, orient = 'columns')
                predictions_df = pd.DataFrame.from_dict(predictions_dict, orient = 'columns')

            # Appending yearly results
            NB_yearly_scores.append(scores_df)
            NB_yearly_preds.append(predictions_df)

            # Print time for one year (fixed-point; '.2' alone would switch to scientific notation)
            end_time_year = time.time()
            run_time = (end_time_year - start_time_year)/60
            print(f'minutes to run one year: {run_time:.2f}')

        # Saving the results from one comparison
        NB_results_udlændinge.update({f'{ele_y}_scores': NB_yearly_scores,
                                      f'{ele_y}_preds': NB_yearly_preds})

        # One comparison time
        end_time_comparison = time.time()
        run_time = (end_time_comparison - start_time_comparison)/60
        print(f'minutes to run {ele_y}: {run_time:.2f}')

Saving data

In [None]:
# List of dataframes with scores for each group
group_scores = []

for group in list_of_scores:

    # Results only exist for targets that were present in the data
    if group not in NB_results_udlændinge:
        print(f'no results for {group}; skipping')
        continue

    # Stack the yearly score frames and tag them with the group name
    one_group = pd.concat(NB_results_udlændinge[group], ignore_index=True)
    one_group['model'] = group

    group_scores.append(one_group)

# pickle is imported in the notebook's import cell
with open('NB_udlændinge_speeches_plot.pkl', 'wb') as f:
    pickle.dump(group_scores, f, protocol=4)

In [None]:
# One list per group, holding one prediction dataframe per year
group_preds = []

for group in list_of_preds:

    # Results only exist for targets that were present in the data
    if group not in NB_results_udlændinge:
        print(f'no results for {group}; skipping')
        continue

    # explode() unpacks the per-year arrays into one row per test observation
    preds_yearly = [
        pd.DataFrame({"preds_test": list(year['preds_test'].explode()),
                      "index_org": list(year['index_original'].explode())})
        for year in NB_results_udlændinge[group]
    ]

    group_preds.append(preds_yearly)

# pickle is imported in the notebook's import cell
with open('NB_udlændinge_speeches_preds.pkl', 'wb') as f:
    pickle.dump(group_preds, f, protocol=4)

### Klima data

In [None]:
# Target names are taken from the first yearly dataframe
_, _, _, y_list, y_list_rand = create_x_y(klima_speeches_yearly_tfids_dfs[0])

# Results per comparison: '<target>_scores' / '<target>_preds' -> list with one dataframe per year
NB_results_klima = {}

with parallel_backend('multiprocessing',n_jobs=16):

    # zip pairs each target with its random counterpart, so the loop stops at the shorter list
    for ele_y, ele_y_rand in zip(y_list, y_list_rand):

        # Start time one comparison
        start_time_comparison = time.time()

        # Yearly results for this comparison
        NB_yearly_scores = []
        NB_yearly_preds = []

        # Looping over all the years
        for yearly_df in klima_speeches_yearly_tfids_dfs:

            # Start time one year
            start_time_year = time.time()

            # Yearly features and target
            X_yearly, ys, _, _, _ = create_x_y(yearly_df)
            y_yearly = ys[str(ele_y)]

            if y_yearly.sum() == 0:
                # The target sums to zero this year: nothing to fit, use the empty placeholder frames
                scores_df = scores_df_empty
                predictions_df = preds_df_empty

            else:
                # Split data into train and test
                x_train, x_test, y_train, y_test = train_test_split(X_yearly, y_yearly, test_size = 0.25, random_state=42)

                # Seeded shuffle so the folds, and therefore the selected alpha, are reproducible
                cv = StratifiedKFold(n_splits=5, shuffle = True, random_state=42)

                # Grid search over the smoothing parameter, scored by ROC AUC
                model = ComplementNB()
                params = {'alpha': [10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]}
                search = GridSearchCV(model, params, scoring='roc_auc', cv=cv, refit=True)
                result = search.fit(x_train, y_train)

                # refit=True: the best estimator is already refitted on the full training split
                best_model = result.best_estimator_

                # Predicted probability of the positive class
                y_train_pred = best_model.predict_proba(x_train)[:,1]
                y_test_pred = best_model.predict_proba(x_test)[:,1]

                # Metrics for this year
                dict_temp = model_evaluation_df(y_train, y_train_pred, y_test, y_test_pred)

                # One row per year; each cell holds a whole array / the fitted model
                predictions_dict = {'preds_train': [y_train_pred],
                                    'preds_test': [y_test_pred],
                                    'best_model': [best_model],
                                    'index_original': [y_test.index.values]}

                scores_df = pd.DataFrame.from_dict(dict_temp, orient = 'columns')
                predictions_df = pd.DataFrame.from_dict(predictions_dict, orient = 'columns')

            # Appending yearly results
            NB_yearly_scores.append(scores_df)
            NB_yearly_preds.append(predictions_df)

            # Print time for one year (fixed-point; '.2' alone would switch to scientific notation)
            end_time_year = time.time()
            run_time = (end_time_year - start_time_year)/60
            print(f'minutes to run one year: {run_time:.2f}')

        # Saving the results from one comparison
        NB_results_klima.update({f'{ele_y}_scores': NB_yearly_scores,
                                 f'{ele_y}_preds': NB_yearly_preds})

        # One comparison time
        end_time_comparison = time.time()
        run_time = (end_time_comparison - start_time_comparison)/60
        print(f'minutes to run {ele_y}: {run_time:.2f}')

Saving data

In [None]:
# One stacked score frame per group: the yearly score frames stored under the
# group's key are concatenated and tagged with that key in a 'model' column.
group_scores = [
    pd.concat(NB_results_klima[scores_key], ignore_index=True).assign(model=scores_key)
    for scores_key in list_of_scores
]

# Persist the list of per-group score frames
with open('NB_klima_speeches_plot.pkl', 'wb') as out_file:
    pickle.dump(group_scores, out_file, protocol=4)

In [None]:
# For every group, build one frame per year that pairs each test-set
# prediction with the original index value of the row it was made for.
group_preds = []

for preds_key in list_of_preds:

    # The stored cells hold whole arrays; explode() flattens them to one value per row
    yearly_frames = [
        pd.DataFrame({"preds_test": list(yearly_preds['preds_test'].explode()),
                      "index_org": list(yearly_preds['index_original'].explode())})
        for yearly_preds in NB_results_klima[preds_key]
    ]

    group_preds.append(yearly_frames)

# Persist the nested list (groups -> years -> frame)
with open('NB_klima_speeches_preds.pkl', 'wb') as out_file:
    pickle.dump(group_preds, out_file, protocol=4)

## Naive Bayes Random - models

### All data

In [None]:
# Random-label baseline on all speeches: for every target and every year, tune
# a Complement Naive Bayes model on the randomised labels and keep its scores
# and predictions.

# Target identifiers (real and randomised) taken from the first yearly frame
_, _, _, y_list, y_list_rand = create_x_y(all_speeches_yearly_tfids_dfs[0])

# Create dictionary with scores and predictions from each comparison,
# keyed by '<target>_scores' / '<target>_preds' with one list entry per year
NB_results_all = {}

with parallel_backend('multiprocessing', n_jobs=16):

    # Looping over different y values
    for ele_y, ele_y_rand in zip(y_list, y_list_rand):

        # Start time one comparison
        start_time_comparison = time.time()

        # Yearly scores and predictions for this comparison
        NB_yearly_scores = []
        NB_yearly_preds = []

        # Looping over all the years
        for yearly_df in all_speeches_yearly_tfids_dfs:

            # Start time one year
            start_time_year = time.time()

            # Create yearly X and ys
            X, ys, y_rands, _, _ = create_x_y(yearly_df)
            y_yearly = ys[str(ele_y)]
            y_rand_yearly = y_rands[str(ele_y_rand)]
            X_yearly = X

            # Creating dicts
            dict_temp = {}
            predictions_dict = {'preds_train': [], 'preds_test': [], 'best_model': [], 'index_original': []}

            # If there is no data for the year, store the empty placeholders.
            # NOTE(review): the guard looks at the real labels although the model
            # below is fitted on the randomised ones - presumably so the baseline
            # skips the same years as the real-label run; confirm.
            if y_yearly.sum() == 0:

                # Empty DFs
                scores_df = scores_df_empty
                predictions_df = preds_df_empty

            else:
                # Split data into train and test, with the randomised labels as target
                x_train, x_test, y_train, y_test = train_test_split(X_yearly, y_rand_yearly, test_size = 0.25, random_state=42)

                # Initiate cross validation; seeded so the folds are identical on
                # every run, in line with the seeded train/test split above
                cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

                # Define the model
                model = ComplementNB()

                # Set parameter ranges
                params = {'alpha': [10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
                          }

                # Setting up grid search over alpha, scored by ROC AUC
                search = GridSearchCV(model, params, scoring='roc_auc', cv=cv, refit=True)

                # Fitting the search
                result = search.fit(x_train, y_train)

                # Getting the best model
                best_model = result.best_estimator_

                # Predicted probability of the second class (column 1)
                y_train_pred = best_model.predict_proba(x_train)[:,1]
                y_test_pred = best_model.predict_proba(x_test)[:,1]

                # Scores for train and test
                dict_temp = model_evaluation_df(y_train, y_train_pred, y_test, y_test_pred)

                # Store predictions, the fitted model and the index of the test rows
                predictions_dict['preds_train'].append(y_train_pred)
                predictions_dict['preds_test'].append(y_test_pred)
                predictions_dict['best_model'].append(best_model)
                predictions_dict['index_original'].append(y_test.index.values)

                # Creating data frames with metrics and predictions
                scores_df = pd.DataFrame.from_dict(dict_temp, orient = 'columns')
                predictions_df = pd.DataFrame.from_dict(predictions_dict, orient = 'columns')

            # Appending yearly scores and predictions
            NB_yearly_scores.append(scores_df)
            NB_yearly_preds.append(predictions_df)

            # Print time for one year (fixed-point, two decimals)
            end_time_year = time.time()
            run_time = (end_time_year - start_time_year)/60
            print(f'minutes to run one year: {run_time:.2f}')

        # Saving the results from one comparison
        NB_results_all.update({f'{ele_y}_scores': NB_yearly_scores,
                               f'{ele_y}_preds': NB_yearly_preds})

        # One comparison time
        end_time_comparison = time.time()
        run_time = (end_time_comparison - start_time_comparison)/60
        print(f'minutes to run {ele_y}: {run_time:.2f}')

Saving data

In [None]:
# One stacked score frame per group: the yearly score frames stored under the
# group's key are concatenated and tagged with that key in a 'model' column.
group_scores = [
    pd.concat(NB_results_all[scores_key], ignore_index=True).assign(model=scores_key)
    for scores_key in list_of_scores
]

# Persist the list of per-group score frames
with open('NB_all_speeches_plot_rand.pkl', 'wb') as out_file:
    pickle.dump(group_scores, out_file, protocol=4)

In [None]:
# For every group, build one frame per year that pairs each test-set
# prediction with the original index value of the row it was made for.
group_preds = []

for preds_key in list_of_preds:

    # The stored cells hold whole arrays; explode() flattens them to one value per row
    yearly_frames = [
        pd.DataFrame({"preds_test": list(yearly_preds['preds_test'].explode()),
                      "index_org": list(yearly_preds['index_original'].explode())})
        for yearly_preds in NB_results_all[preds_key]
    ]

    group_preds.append(yearly_frames)

# Persist the nested list (groups -> years -> frame)
with open('NB_all_speeches_preds_rand.pkl', 'wb') as out_file:
    pickle.dump(group_preds, out_file, protocol=4)

### Velfærd data

In [None]:
# Random-label baseline on the velfærd speeches: for every target and every
# year, tune a Complement Naive Bayes model on the randomised labels and keep
# its scores and predictions.

# Target identifiers (real and randomised) taken from the first yearly frame
_, _, _, y_list, y_list_rand = create_x_y(velfærd_speeches_yearly_tfids_dfs[0])

# Create dictionary with scores and predictions from each comparison,
# keyed by '<target>_scores' / '<target>_preds' with one list entry per year
NB_results_velfærd = {}

with parallel_backend('multiprocessing', n_jobs=16):

    # Looping over different y values
    for ele_y, ele_y_rand in zip(y_list, y_list_rand):

        # Start time one comparison
        start_time_comparison = time.time()

        # Yearly scores and predictions for this comparison
        NB_yearly_scores = []
        NB_yearly_preds = []

        # Looping over all the years
        for yearly_df in velfærd_speeches_yearly_tfids_dfs:

            # Start time one year
            start_time_year = time.time()

            # Create yearly X and ys
            X, ys, y_rands, _, _ = create_x_y(yearly_df)
            y_yearly = ys[str(ele_y)]
            y_rand_yearly = y_rands[str(ele_y_rand)]
            X_yearly = X

            # Creating dicts
            dict_temp = {}
            predictions_dict = {'preds_train': [], 'preds_test': [], 'best_model': [], 'index_original': []}

            # If there is no data for the year, store the empty placeholders.
            # NOTE(review): the guard looks at the real labels although the model
            # below is fitted on the randomised ones - presumably so the baseline
            # skips the same years as the real-label run; confirm.
            if y_yearly.sum() == 0:

                # Empty DFs
                scores_df = scores_df_empty
                predictions_df = preds_df_empty

            else:
                # Split data into train and test, with the randomised labels as target
                x_train, x_test, y_train, y_test = train_test_split(X_yearly, y_rand_yearly, test_size = 0.25, random_state=42)

                # Initiate cross validation; seeded so the folds are identical on
                # every run, in line with the seeded train/test split above
                cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

                # Define the model
                model = ComplementNB()

                # Set parameter ranges
                params = {'alpha': [10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
                          }

                # Setting up grid search over alpha, scored by ROC AUC
                search = GridSearchCV(model, params, scoring='roc_auc', cv=cv, refit=True)

                # Fitting the search
                result = search.fit(x_train, y_train)

                # Getting the best model
                best_model = result.best_estimator_

                # Predicted probability of the second class (column 1)
                y_train_pred = best_model.predict_proba(x_train)[:,1]
                y_test_pred = best_model.predict_proba(x_test)[:,1]

                # Scores for train and test
                dict_temp = model_evaluation_df(y_train, y_train_pred, y_test, y_test_pred)

                # Store predictions, the fitted model and the index of the test rows
                predictions_dict['preds_train'].append(y_train_pred)
                predictions_dict['preds_test'].append(y_test_pred)
                predictions_dict['best_model'].append(best_model)
                predictions_dict['index_original'].append(y_test.index.values)

                # Creating data frames with metrics and predictions
                scores_df = pd.DataFrame.from_dict(dict_temp, orient = 'columns')
                predictions_df = pd.DataFrame.from_dict(predictions_dict, orient = 'columns')

            # Appending yearly scores and predictions
            NB_yearly_scores.append(scores_df)
            NB_yearly_preds.append(predictions_df)

            # Print time for one year (fixed-point, two decimals)
            end_time_year = time.time()
            run_time = (end_time_year - start_time_year)/60
            print(f'minutes to run one year: {run_time:.2f}')

        # Saving the results from one comparison
        NB_results_velfærd.update({f'{ele_y}_scores': NB_yearly_scores,
                                   f'{ele_y}_preds': NB_yearly_preds})

        # One comparison time
        end_time_comparison = time.time()
        run_time = (end_time_comparison - start_time_comparison)/60
        print(f'minutes to run {ele_y}: {run_time:.2f}')

Saving data

In [None]:
# One stacked score frame per group: the yearly score frames stored under the
# group's key are concatenated and tagged with that key in a 'model' column.
group_scores = [
    pd.concat(NB_results_velfærd[scores_key], ignore_index=True).assign(model=scores_key)
    for scores_key in list_of_scores
]

# Persist the list of per-group score frames
with open('NB_velfærd_speeches_plot_rand.pkl', 'wb') as out_file:
    pickle.dump(group_scores, out_file, protocol=4)

In [None]:
# For every group, build one frame per year that pairs each test-set
# prediction with the original index value of the row it was made for.
group_preds = []

for preds_key in list_of_preds:

    # The stored cells hold whole arrays; explode() flattens them to one value per row
    yearly_frames = [
        pd.DataFrame({"preds_test": list(yearly_preds['preds_test'].explode()),
                      "index_org": list(yearly_preds['index_original'].explode())})
        for yearly_preds in NB_results_velfærd[preds_key]
    ]

    group_preds.append(yearly_frames)

# Persist the nested list (groups -> years -> frame)
with open('NB_velfærd_speeches_preds_rand.pkl', 'wb') as out_file:
    pickle.dump(group_preds, out_file, protocol=4)

### Skat data

In [None]:
# Random-label baseline on the skat speeches: for every target and every
# year, tune a Complement Naive Bayes model on the randomised labels and keep
# its scores and predictions.

# Target identifiers (real and randomised) taken from the first yearly frame
_, _, _, y_list, y_list_rand = create_x_y(skat_speeches_yearly_tfids_dfs[0])

# Create dictionary with scores and predictions from each comparison,
# keyed by '<target>_scores' / '<target>_preds' with one list entry per year
NB_results_skat = {}

with parallel_backend('multiprocessing', n_jobs=16):

    # Looping over different y values
    for ele_y, ele_y_rand in zip(y_list, y_list_rand):

        # Start time one comparison
        start_time_comparison = time.time()

        # Yearly scores and predictions for this comparison
        NB_yearly_scores = []
        NB_yearly_preds = []

        # Looping over all the years
        for yearly_df in skat_speeches_yearly_tfids_dfs:

            # Start time one year
            start_time_year = time.time()

            # Create yearly X and ys
            X, ys, y_rands, _, _ = create_x_y(yearly_df)
            y_yearly = ys[str(ele_y)]
            y_rand_yearly = y_rands[str(ele_y_rand)]
            X_yearly = X

            # Creating dicts
            dict_temp = {}
            predictions_dict = {'preds_train': [], 'preds_test': [], 'best_model': [], 'index_original': []}

            # If there is no data for the year, store the empty placeholders.
            # NOTE(review): the guard looks at the real labels although the model
            # below is fitted on the randomised ones - presumably so the baseline
            # skips the same years as the real-label run; confirm.
            if y_yearly.sum() == 0:

                # Empty DFs
                scores_df = scores_df_empty
                predictions_df = preds_df_empty

            else:
                # Split data into train and test, with the randomised labels as target
                x_train, x_test, y_train, y_test = train_test_split(X_yearly, y_rand_yearly, test_size = 0.25, random_state=42)

                # Initiate cross validation; seeded so the folds are identical on
                # every run, in line with the seeded train/test split above
                cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

                # Define the model
                model = ComplementNB()

                # Set parameter ranges
                params = {'alpha': [10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
                          }

                # Setting up grid search over alpha, scored by ROC AUC
                search = GridSearchCV(model, params, scoring='roc_auc', cv=cv, refit=True)

                # Fitting the search
                result = search.fit(x_train, y_train)

                # Getting the best model
                best_model = result.best_estimator_

                # Predicted probability of the second class (column 1)
                y_train_pred = best_model.predict_proba(x_train)[:,1]
                y_test_pred = best_model.predict_proba(x_test)[:,1]

                # Scores for train and test
                dict_temp = model_evaluation_df(y_train, y_train_pred, y_test, y_test_pred)

                # Store predictions, the fitted model and the index of the test rows
                predictions_dict['preds_train'].append(y_train_pred)
                predictions_dict['preds_test'].append(y_test_pred)
                predictions_dict['best_model'].append(best_model)
                predictions_dict['index_original'].append(y_test.index.values)

                # Creating data frames with metrics and predictions
                scores_df = pd.DataFrame.from_dict(dict_temp, orient = 'columns')
                predictions_df = pd.DataFrame.from_dict(predictions_dict, orient = 'columns')

            # Appending yearly scores and predictions
            NB_yearly_scores.append(scores_df)
            NB_yearly_preds.append(predictions_df)

            # Print time for one year (fixed-point, two decimals)
            end_time_year = time.time()
            run_time = (end_time_year - start_time_year)/60
            print(f'minutes to run one year: {run_time:.2f}')

        # Saving the results from one comparison
        NB_results_skat.update({f'{ele_y}_scores': NB_yearly_scores,
                                f'{ele_y}_preds': NB_yearly_preds})

        # One comparison time
        end_time_comparison = time.time()
        run_time = (end_time_comparison - start_time_comparison)/60
        print(f'minutes to run {ele_y}: {run_time:.2f}')

Saving data

In [None]:
# List of dataframes with scores for each group (random-label baseline, skat)
group_scores = []

# Loop over the score keys of the results dictionary
for group in list_of_scores:

    # Stack the yearly score frames of this group and tag them with the group key
    one_group = pd.concat(NB_results_skat[group], ignore_index=True)
    one_group['model'] = group

    # Appending df to a list
    group_scores.append(one_group)

# The '_rand' suffix matches the other random-baseline outputs (and the preds
# file written for this same run), so this file no longer collides with a
# 'NB_skat_speeches_plot.pkl' written for the real-label models.
with open('NB_skat_speeches_plot_rand.pkl', 'wb') as f:
    pickle.dump(group_scores, f, protocol=4)

In [None]:
# For every group, build one frame per year that pairs each test-set
# prediction with the original index value of the row it was made for.
group_preds = []

for preds_key in list_of_preds:

    # The stored cells hold whole arrays; explode() flattens them to one value per row
    yearly_frames = [
        pd.DataFrame({"preds_test": list(yearly_preds['preds_test'].explode()),
                      "index_org": list(yearly_preds['index_original'].explode())})
        for yearly_preds in NB_results_skat[preds_key]
    ]

    group_preds.append(yearly_frames)

# Persist the nested list (groups -> years -> frame)
with open('NB_skat_speeches_preds_rand.pkl', 'wb') as out_file:
    pickle.dump(group_preds, out_file, protocol=4)

### Udlændinge data

In [None]:
# Random-label baseline on the udlændinge speeches: for every target and every
# year, tune a Complement Naive Bayes model on the randomised labels and keep
# its scores and predictions.

# Target identifiers (real and randomised) taken from the first yearly frame
_, _, _, y_list, y_list_rand = create_x_y(udlændinge_speeches_yearly_tfids_dfs[0])

# Create dictionary with scores and predictions from each comparison,
# keyed by '<target>_scores' / '<target>_preds' with one list entry per year
NB_results_udlændinge = {}

with parallel_backend('multiprocessing', n_jobs=16):

    # Looping over different y values
    for ele_y, ele_y_rand in zip(y_list, y_list_rand):

        # Start time one comparison
        start_time_comparison = time.time()

        # Yearly scores and predictions for this comparison
        NB_yearly_scores = []
        NB_yearly_preds = []

        # Looping over all the years
        for yearly_df in udlændinge_speeches_yearly_tfids_dfs:

            # Start time one year
            start_time_year = time.time()

            # Create yearly X and ys
            X, ys, y_rands, _, _ = create_x_y(yearly_df)
            y_yearly = ys[str(ele_y)]
            y_rand_yearly = y_rands[str(ele_y_rand)]
            X_yearly = X

            # Creating dicts
            dict_temp = {}
            predictions_dict = {'preds_train': [], 'preds_test': [], 'best_model': [], 'index_original': []}

            # If there is no data for the year, store the empty placeholders.
            # NOTE(review): the guard looks at the real labels although the model
            # below is fitted on the randomised ones - presumably so the baseline
            # skips the same years as the real-label run; confirm.
            if y_yearly.sum() == 0:

                # Empty DFs
                scores_df = scores_df_empty
                predictions_df = preds_df_empty

            else:
                # Split data into train and test, with the randomised labels as target
                x_train, x_test, y_train, y_test = train_test_split(X_yearly, y_rand_yearly, test_size = 0.25, random_state=42)

                # Initiate cross validation; seeded so the folds are identical on
                # every run, in line with the seeded train/test split above
                cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

                # Define the model
                model = ComplementNB()

                # Set parameter ranges
                params = {'alpha': [10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
                          }

                # Setting up grid search over alpha, scored by ROC AUC
                search = GridSearchCV(model, params, scoring='roc_auc', cv=cv, refit=True)

                # Fitting the search
                result = search.fit(x_train, y_train)

                # Getting the best model
                best_model = result.best_estimator_

                # Predicted probability of the second class (column 1)
                y_train_pred = best_model.predict_proba(x_train)[:,1]
                y_test_pred = best_model.predict_proba(x_test)[:,1]

                # Scores for train and test
                dict_temp = model_evaluation_df(y_train, y_train_pred, y_test, y_test_pred)

                # Store predictions, the fitted model and the index of the test rows
                predictions_dict['preds_train'].append(y_train_pred)
                predictions_dict['preds_test'].append(y_test_pred)
                predictions_dict['best_model'].append(best_model)
                predictions_dict['index_original'].append(y_test.index.values)

                # Creating data frames with metrics and predictions
                scores_df = pd.DataFrame.from_dict(dict_temp, orient = 'columns')
                predictions_df = pd.DataFrame.from_dict(predictions_dict, orient = 'columns')

            # Appending yearly scores and predictions
            NB_yearly_scores.append(scores_df)
            NB_yearly_preds.append(predictions_df)

            # Print time for one year (fixed-point, two decimals)
            end_time_year = time.time()
            run_time = (end_time_year - start_time_year)/60
            print(f'minutes to run one year: {run_time:.2f}')

        # Saving the results from one comparison
        NB_results_udlændinge.update({f'{ele_y}_scores': NB_yearly_scores,
                                      f'{ele_y}_preds': NB_yearly_preds})

        # One comparison time
        end_time_comparison = time.time()
        run_time = (end_time_comparison - start_time_comparison)/60
        print(f'minutes to run {ele_y}: {run_time:.2f}')

Saving data

In [None]:
# One stacked score frame per group: the yearly score frames stored under the
# group's key are concatenated and tagged with that key in a 'model' column.
group_scores = [
    pd.concat(NB_results_udlændinge[scores_key], ignore_index=True).assign(model=scores_key)
    for scores_key in list_of_scores
]

# Persist the list of per-group score frames
with open('NB_udlændinge_speeches_plot_rand.pkl', 'wb') as out_file:
    pickle.dump(group_scores, out_file, protocol=4)

In [None]:
# For every group, build one frame per year that pairs each test-set
# prediction with the original index value of the row it was made for.
group_preds = []

for preds_key in list_of_preds:

    # The stored cells hold whole arrays; explode() flattens them to one value per row
    yearly_frames = [
        pd.DataFrame({"preds_test": list(yearly_preds['preds_test'].explode()),
                      "index_org": list(yearly_preds['index_original'].explode())})
        for yearly_preds in NB_results_udlændinge[preds_key]
    ]

    group_preds.append(yearly_frames)

# Persist the nested list (groups -> years -> frame)
with open('NB_udlændinge_speeches_preds_rand.pkl', 'wb') as out_file:
    pickle.dump(group_preds, out_file, protocol=4)

### Klima data

In [None]:
# Random-label baseline on the klima speeches: for every target and every
# year, tune a Complement Naive Bayes model on the randomised labels and keep
# its scores and predictions.

# Target identifiers (real and randomised) taken from the first yearly frame
_, _, _, y_list, y_list_rand = create_x_y(klima_speeches_yearly_tfids_dfs[0])

# Create dictionary with scores and predictions from each comparison,
# keyed by '<target>_scores' / '<target>_preds' with one list entry per year
NB_results_klima = {}

with parallel_backend('multiprocessing', n_jobs=16):

    # Looping over different y values
    for ele_y, ele_y_rand in zip(y_list, y_list_rand):

        # Start time one comparison
        start_time_comparison = time.time()

        # Yearly scores and predictions for this comparison
        NB_yearly_scores = []
        NB_yearly_preds = []

        # Looping over all the years
        for yearly_df in klima_speeches_yearly_tfids_dfs:

            # Start time one year
            start_time_year = time.time()

            # Create yearly X and ys
            X, ys, y_rands, _, _ = create_x_y(yearly_df)
            y_yearly = ys[str(ele_y)]
            y_rand_yearly = y_rands[str(ele_y_rand)]
            X_yearly = X

            # Creating dicts
            dict_temp = {}
            predictions_dict = {'preds_train': [], 'preds_test': [], 'best_model': [], 'index_original': []}

            # If there is no data for the year, store the empty placeholders.
            # NOTE(review): the guard looks at the real labels although the model
            # below is fitted on the randomised ones - presumably so the baseline
            # skips the same years as the real-label run; confirm.
            if y_yearly.sum() == 0:

                # Empty DFs
                scores_df = scores_df_empty
                predictions_df = preds_df_empty

            else:
                # Split data into train and test, with the randomised labels as target
                x_train, x_test, y_train, y_test = train_test_split(X_yearly, y_rand_yearly, test_size = 0.25, random_state=42)

                # Initiate cross validation; seeded so the folds are identical on
                # every run, in line with the seeded train/test split above
                cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

                # Define the model
                model = ComplementNB()

                # Set parameter ranges
                params = {'alpha': [10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
                          }

                # Setting up grid search over alpha, scored by ROC AUC
                search = GridSearchCV(model, params, scoring='roc_auc', cv=cv, refit=True)

                # Fitting the search
                result = search.fit(x_train, y_train)

                # Getting the best model
                best_model = result.best_estimator_

                # Predicted probability of the second class (column 1)
                y_train_pred = best_model.predict_proba(x_train)[:,1]
                y_test_pred = best_model.predict_proba(x_test)[:,1]

                # Scores for train and test
                dict_temp = model_evaluation_df(y_train, y_train_pred, y_test, y_test_pred)

                # Store predictions, the fitted model and the index of the test rows
                predictions_dict['preds_train'].append(y_train_pred)
                predictions_dict['preds_test'].append(y_test_pred)
                predictions_dict['best_model'].append(best_model)
                predictions_dict['index_original'].append(y_test.index.values)

                # Creating data frames with metrics and predictions
                scores_df = pd.DataFrame.from_dict(dict_temp, orient = 'columns')
                predictions_df = pd.DataFrame.from_dict(predictions_dict, orient = 'columns')

            # Appending yearly scores and predictions
            NB_yearly_scores.append(scores_df)
            NB_yearly_preds.append(predictions_df)

            # Print time for one year (fixed-point, two decimals)
            end_time_year = time.time()
            run_time = (end_time_year - start_time_year)/60
            print(f'minutes to run one year: {run_time:.2f}')

        # Saving the results from one comparison
        NB_results_klima.update({f'{ele_y}_scores': NB_yearly_scores,
                                 f'{ele_y}_preds': NB_yearly_preds})

        # One comparison time
        end_time_comparison = time.time()
        run_time = (end_time_comparison - start_time_comparison)/60
        print(f'minutes to run {ele_y}: {run_time:.2f}')

Saving data

In [None]:
# One stacked score frame per group: the yearly score frames stored under the
# group's key are concatenated and tagged with that key in a 'model' column.
group_scores = [
    pd.concat(NB_results_klima[scores_key], ignore_index=True).assign(model=scores_key)
    for scores_key in list_of_scores
]

# Persist the list of per-group score frames
with open('NB_klima_speeches_plot_rand.pkl', 'wb') as out_file:
    pickle.dump(group_scores, out_file, protocol=4)

In [None]:
# For every group, build one frame per year that pairs each test-set
# prediction with the original index value of the row it was made for.
group_preds = []

for preds_key in list_of_preds:

    # The stored cells hold whole arrays; explode() flattens them to one value per row
    yearly_frames = [
        pd.DataFrame({"preds_test": list(yearly_preds['preds_test'].explode()),
                      "index_org": list(yearly_preds['index_original'].explode())})
        for yearly_preds in NB_results_klima[preds_key]
    ]

    group_preds.append(yearly_frames)

# Persist the nested list (groups -> years -> frame)
with open('NB_klima_speeches_preds_rand.pkl', 'wb') as out_file:
    pickle.dump(group_preds, out_file, protocol=4)

## Random Forest - models

### All data

In [None]:
# Random Forest on all speeches: for every target and every year, tune a
# class-balanced Random Forest on the real labels and keep its scores and
# predictions.

# Target identifiers (real and randomised) taken from the first yearly frame
_, _, _, y_list, y_list_rand = create_x_y(all_speeches_yearly_tfids_dfs[0])

# Create dictionary with scores and predictions from each comparison,
# keyed by '<target>_scores' / '<target>_preds' with one list entry per year
RF_results_all = {}

with parallel_backend('multiprocessing', n_jobs=16):

    # Looping over different y values
    for ele_y, ele_y_rand in zip(y_list, y_list_rand):

        # Start time one comparison
        start_time_comparison = time.time()

        # Yearly scores and predictions for this comparison
        RF_yearly_scores = []
        RF_yearly_preds = []

        # Looping over all the years
        for yearly_df in all_speeches_yearly_tfids_dfs:

            # Start time one year
            start_time_year = time.time()

            # Create yearly X and ys (the randomised labels are looked up but
            # not used by the model in this cell)
            X, ys, y_rands, _, _ = create_x_y(yearly_df)
            y_yearly = ys[str(ele_y)]
            y_rand_yearly = y_rands[str(ele_y_rand)]
            X_yearly = X

            # Creating dicts
            dict_temp = {}
            predictions_dict = {'preds_train': [], 'preds_test': [], 'best_model': [], 'index_original': []}

            # If there is no data for the year, store the empty placeholders
            if y_yearly.sum() == 0:

                # Empty DFs
                scores_df = scores_df_empty
                predictions_df = preds_df_empty

            else:
                # Split data into train and test
                x_train, x_test, y_train, y_test = train_test_split(X_yearly, y_yearly, test_size = 0.25, random_state=42)

                # Initiate cross validation; seeded so the folds are identical on
                # every run, in line with the seeded train/test split above
                cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

                # Define the model; seeded so the fitted forest is reproducible
                model = RandomForestClassifier(class_weight="balanced", random_state=42)

                # Set parameter ranges
                params = {"n_estimators": [16,32,64,128],
                          "max_depth": [3,5,7,9],
                          "min_samples_split": [2,4,6],
                          "min_samples_leaf": [1,2,4]
                         }

                # Setting up the randomized search, scored by ROC AUC; seeded so the
                # same parameter combinations are sampled on every run
                search = RandomizedSearchCV(model, params, scoring='roc_auc', cv=cv, refit=True, random_state=42)

                # Fitting the search
                result = search.fit(x_train, y_train)

                # Getting the best model
                best_model = result.best_estimator_

                # Predicted probability of the second class (column 1)
                y_train_pred = best_model.predict_proba(x_train)[:,1]
                y_test_pred = best_model.predict_proba(x_test)[:,1]

                # Scores for train and test
                dict_temp = model_evaluation_df(y_train, y_train_pred, y_test, y_test_pred)

                # Store predictions, the fitted model and the index of the test rows
                predictions_dict['preds_train'].append(y_train_pred)
                predictions_dict['preds_test'].append(y_test_pred)
                predictions_dict['best_model'].append(best_model)
                predictions_dict['index_original'].append(y_test.index.values)

                # Creating data frames with metrics and predictions
                scores_df = pd.DataFrame.from_dict(dict_temp, orient = 'columns')
                predictions_df = pd.DataFrame.from_dict(predictions_dict, orient = 'columns')

            # Appending yearly scores and predictions
            RF_yearly_scores.append(scores_df)
            RF_yearly_preds.append(predictions_df)

            # Print time for one year (fixed-point, two decimals)
            end_time_year = time.time()
            run_time = (end_time_year - start_time_year)/60
            print(f'minutes to run one year: {run_time:.2f}')

        # Saving the results from one comparison
        RF_results_all.update({f'{ele_y}_scores': RF_yearly_scores,
                               f'{ele_y}_preds': RF_yearly_preds})

        # One comparison time
        end_time_comparison = time.time()
        run_time = (end_time_comparison - start_time_comparison)/60
        print(f'minutes to run {ele_y}: {run_time:.2f}')

Saving data

In [None]:
# One stacked score frame per group: the yearly score frames stored under the
# group's key are concatenated and tagged with that key in a 'model' column.
group_scores = [
    pd.concat(RF_results_all[scores_key], ignore_index=True).assign(model=scores_key)
    for scores_key in list_of_scores
]

# Persist the list of per-group score frames
with open('RF_all_speeches_plot.pkl', 'wb') as out_file:
    pickle.dump(group_scores, out_file, protocol=4)

In [None]:
# For every group, build one frame per year that pairs each test-set
# prediction with the original index value of the row it was made for.
group_preds = []

for preds_key in list_of_preds:

    # The stored cells hold whole arrays; explode() flattens them to one value per row
    yearly_frames = [
        pd.DataFrame({"preds_test": list(yearly_preds['preds_test'].explode()),
                      "index_org": list(yearly_preds['index_original'].explode())})
        for yearly_preds in RF_results_all[preds_key]
    ]

    group_preds.append(yearly_frames)

# Persist the nested list (groups -> years -> frame)
with open('RF_all_speeches_preds.pkl', 'wb') as out_file:
    pickle.dump(group_preds, out_file, protocol=4)

### Velfærd data

In [None]:
# Random Forest on the velfærd speeches: for every target column and every
# yearly frame, tune a classifier with a randomized search, score it on a
# held-out test split and keep both the metrics and the predictions.

# Target names (real and random) are read from the first yearly frame
_, _, _, y_list, y_list_rand = create_x_y(velfærd_speeches_yearly_tfids_dfs[0])

# Create dictionary with scores and predictions from each comparison
RF_results_velfærd = {}

with parallel_backend('multiprocessing', n_jobs=16):

    # Looping over different y values
    for ele_y, ele_y_rand in zip(y_list, y_list_rand):

        # Start time one comparison
        start_time_comparison = time.time()

        # One entry per year
        RF_yearly_scores = []
        RF_yearly_preds = []

        # Looping over all the years
        for yearly_df in velfærd_speeches_yearly_tfids_dfs:

            # Start time one year
            start_time_year = time.time()

            # Create yearly X and ys
            X, ys, y_rands, _, _ = create_x_y(yearly_df)
            y_yearly = ys[str(ele_y)]
            # Not used further in this cell; the lookup is kept from the original code
            y_rand_yearly = y_rands[str(ele_y_rand)]
            X_yearly = X

            # Containers for the metrics and predictions of this year
            dict_temp = {}
            predictions_dict = {'preds_train': [], 'preds_test': [], 'best_model': [], 'index_original': []}

            # Nothing to fit when the target sums to zero for this year
            if y_yearly.sum() == 0:

                # Use the empty placeholder frames defined elsewhere in the notebook
                scores_df = scores_df_empty
                predictions_df = preds_df_empty

            else:
                # Split data into train and test
                x_train, x_test, y_train, y_test = train_test_split(X_yearly, y_yearly, test_size = 0.25, random_state=42)

                # Initiate cross validation (seeded so the shuffled folds are reproducible)
                cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

                # Define the model (seeded so the fitted forest is reproducible)
                model = RandomForestClassifier(class_weight="balanced", random_state=42)

                # Set parameter ranges
                params = {"n_estimators": [16,32,64,128],
                          "max_depth": [3,5,7,9],
                          "min_samples_split": [2,4,6],
                          "min_samples_leaf": [1,2,4]
                         }

                # Setting up the randomized search (seeded so the sampled candidates are reproducible)
                search = RandomizedSearchCV(model, params, scoring='roc_auc', cv=cv, refit=True, random_state=42)

                # Running the search; refit=True refits the best candidate on x_train
                result = search.fit(x_train, y_train)

                # Getting the best model
                best_model = result.best_estimator_

                # Predicted probability of the second class (column 1)
                y_train_pred = best_model.predict_proba(x_train)[:,1]
                y_test_pred = best_model.predict_proba(x_test)[:,1]

                # Computing scores
                dict_temp = model_evaluation_df(y_train, y_train_pred, y_test, y_test_pred)

                # Collecting predicted values, the fitted model and the test rows' original index
                predictions_dict['preds_train'].append(y_train_pred)
                predictions_dict['preds_test'].append(y_test_pred)
                predictions_dict['best_model'].append(best_model)
                predictions_dict['index_original'].append(y_test.index.values)

                # Creating data frames with metrics and predictions
                scores_df = pd.DataFrame.from_dict(dict_temp, orient = 'columns')
                predictions_df = pd.DataFrame.from_dict(predictions_dict, orient = 'columns')

            # Appending yearly scores and predictions
            RF_yearly_scores.append(scores_df)
            RF_yearly_preds.append(predictions_df)

            # Print time for one year ('.2f' = two decimals; a bare '.2' means two
            # significant digits and prints e.g. 12.3 as '1.2e+01')
            end_time_year = time.time()
            run_time = (end_time_year - start_time_year)/60
            print(f'minutes to run one year: {run_time:.2f}')

        # Saving the results from one comparison
        RF_results_velfærd.update({f'{ele_y}_scores': RF_yearly_scores,
                                   f'{ele_y}_preds': RF_yearly_preds})

        # One comparison time
        end_time_comparison = time.time()
        run_time = (end_time_comparison - start_time_comparison)/60
        print(f'minutes to run {ele_y}: {run_time:.2f}')

Saving data

In [None]:
# One score table per group: the yearly score frames of a group are stacked
# into a single frame and tagged with the group's key in the 'model' column.
group_scores = [
    pd.concat(RF_results_velfærd[score_key], ignore_index=True).assign(model=score_key)
    for score_key in list_of_scores
]

# Store the list of score tables on disk
with open('RF_velfærd_speeches_plot.pkl', 'wb') as out_file:
    pickle.dump(group_scores, out_file, protocol=4)

In [None]:
# For each group: a list with one frame per year, holding the exploded test
# predictions next to the exploded original index values.
group_preds = [
    [
        pd.DataFrame({"preds_test": list(yearly_preds['preds_test'].explode()),
                      "index_org": list(yearly_preds['index_original'].explode())})
        for yearly_preds in RF_results_velfærd[preds_key]
    ]
    for preds_key in list_of_preds
]

# Store the nested list of prediction frames on disk
with open('RF_velfærd_speeches_preds.pkl', 'wb') as out_file:
    pickle.dump(group_preds, out_file, protocol=4)

### Skat data

In [None]:
# Random Forest on the skat speeches: for every target column and every
# yearly frame, tune a classifier with a randomized search, score it on a
# held-out test split and keep both the metrics and the predictions.

# Target names (real and random) are read from the first yearly frame
_, _, _, y_list, y_list_rand = create_x_y(skat_speeches_yearly_tfids_dfs[0])

# Create dictionary with scores and predictions from each comparison
RF_results_skat = {}

with parallel_backend('multiprocessing', n_jobs=16):

    # Looping over different y values
    for ele_y, ele_y_rand in zip(y_list, y_list_rand):

        # Start time one comparison
        start_time_comparison = time.time()

        # One entry per year
        RF_yearly_scores = []
        RF_yearly_preds = []

        # Looping over all the years
        for yearly_df in skat_speeches_yearly_tfids_dfs:

            # Start time one year
            start_time_year = time.time()

            # Create yearly X and ys
            X, ys, y_rands, _, _ = create_x_y(yearly_df)
            y_yearly = ys[str(ele_y)]
            # Not used further in this cell; the lookup is kept from the original code
            y_rand_yearly = y_rands[str(ele_y_rand)]
            X_yearly = X

            # Containers for the metrics and predictions of this year
            dict_temp = {}
            predictions_dict = {'preds_train': [], 'preds_test': [], 'best_model': [], 'index_original': []}

            # Nothing to fit when the target sums to zero for this year
            if y_yearly.sum() == 0:

                # Use the empty placeholder frames defined elsewhere in the notebook
                scores_df = scores_df_empty
                predictions_df = preds_df_empty

            else:
                # Split data into train and test
                x_train, x_test, y_train, y_test = train_test_split(X_yearly, y_yearly, test_size = 0.25, random_state=42)

                # Initiate cross validation (seeded so the shuffled folds are reproducible)
                cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

                # Define the model (seeded so the fitted forest is reproducible)
                model = RandomForestClassifier(class_weight="balanced", random_state=42)

                # Set parameter ranges
                params = {"n_estimators": [16,32,64,128],
                          "max_depth": [3,5,7,9],
                          "min_samples_split": [2,4,6],
                          "min_samples_leaf": [1,2,4]
                         }

                # Setting up the randomized search (seeded so the sampled candidates are reproducible)
                search = RandomizedSearchCV(model, params, scoring='roc_auc', cv=cv, refit=True, random_state=42)

                # Running the search; refit=True refits the best candidate on x_train
                result = search.fit(x_train, y_train)

                # Getting the best model
                best_model = result.best_estimator_

                # Predicted probability of the second class (column 1)
                y_train_pred = best_model.predict_proba(x_train)[:,1]
                y_test_pred = best_model.predict_proba(x_test)[:,1]

                # Computing scores
                dict_temp = model_evaluation_df(y_train, y_train_pred, y_test, y_test_pred)

                # Collecting predicted values, the fitted model and the test rows' original index
                predictions_dict['preds_train'].append(y_train_pred)
                predictions_dict['preds_test'].append(y_test_pred)
                predictions_dict['best_model'].append(best_model)
                predictions_dict['index_original'].append(y_test.index.values)

                # Creating data frames with metrics and predictions
                scores_df = pd.DataFrame.from_dict(dict_temp, orient = 'columns')
                predictions_df = pd.DataFrame.from_dict(predictions_dict, orient = 'columns')

            # Appending yearly scores and predictions
            RF_yearly_scores.append(scores_df)
            RF_yearly_preds.append(predictions_df)

            # Print time for one year ('.2f' = two decimals; a bare '.2' means two
            # significant digits and prints e.g. 12.3 as '1.2e+01')
            end_time_year = time.time()
            run_time = (end_time_year - start_time_year)/60
            print(f'minutes to run one year: {run_time:.2f}')

        # Saving the results from one comparison
        RF_results_skat.update({f'{ele_y}_scores': RF_yearly_scores,
                                f'{ele_y}_preds': RF_yearly_preds})

        # One comparison time
        end_time_comparison = time.time()
        run_time = (end_time_comparison - start_time_comparison)/60
        print(f'minutes to run {ele_y}: {run_time:.2f}')

Saving data

In [None]:
# One score table per group: the yearly score frames of a group are stacked
# into a single frame and tagged with the group's key in the 'model' column.
group_scores = [
    pd.concat(RF_results_skat[score_key], ignore_index=True).assign(model=score_key)
    for score_key in list_of_scores
]

# Store the list of score tables on disk
with open('RF_skat_speeches_plot.pkl', 'wb') as out_file:
    pickle.dump(group_scores, out_file, protocol=4)

In [None]:
# For each group: a list with one frame per year, holding the exploded test
# predictions next to the exploded original index values.
group_preds = [
    [
        pd.DataFrame({"preds_test": list(yearly_preds['preds_test'].explode()),
                      "index_org": list(yearly_preds['index_original'].explode())})
        for yearly_preds in RF_results_skat[preds_key]
    ]
    for preds_key in list_of_preds
]

# Store the nested list of prediction frames on disk
with open('RF_skat_speeches_preds.pkl', 'wb') as out_file:
    pickle.dump(group_preds, out_file, protocol=4)

### Udlændinge data

In [None]:
# Random Forest on the udlændinge speeches: for every target column and every
# yearly frame, tune a classifier with a randomized search, score it on a
# held-out test split and keep both the metrics and the predictions.

# Target names (real and random) are read from the first yearly frame
_, _, _, y_list, y_list_rand = create_x_y(udlændinge_speeches_yearly_tfids_dfs[0])

# Create dictionary with scores and predictions from each comparison
RF_results_udlændinge = {}

with parallel_backend('multiprocessing', n_jobs=16):

    # Looping over different y values
    for ele_y, ele_y_rand in zip(y_list, y_list_rand):

        # Start time one comparison
        start_time_comparison = time.time()

        # One entry per year
        RF_yearly_scores = []
        RF_yearly_preds = []

        # Looping over all the years
        for yearly_df in udlændinge_speeches_yearly_tfids_dfs:

            # Start time one year
            start_time_year = time.time()

            # Create yearly X and ys
            X, ys, y_rands, _, _ = create_x_y(yearly_df)
            y_yearly = ys[str(ele_y)]
            # Not used further in this cell; the lookup is kept from the original code
            y_rand_yearly = y_rands[str(ele_y_rand)]
            X_yearly = X

            # Containers for the metrics and predictions of this year
            dict_temp = {}
            predictions_dict = {'preds_train': [], 'preds_test': [], 'best_model': [], 'index_original': []}

            # Nothing to fit when the target sums to zero for this year
            if y_yearly.sum() == 0:

                # Use the empty placeholder frames defined elsewhere in the notebook
                scores_df = scores_df_empty
                predictions_df = preds_df_empty

            else:
                # Split data into train and test
                x_train, x_test, y_train, y_test = train_test_split(X_yearly, y_yearly, test_size = 0.25, random_state=42)

                # Initiate cross validation (seeded so the shuffled folds are reproducible)
                cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

                # Define the model (seeded so the fitted forest is reproducible)
                model = RandomForestClassifier(class_weight="balanced", random_state=42)

                # Set parameter ranges
                params = {"n_estimators": [16,32,64,128],
                          "max_depth": [3,5,7,9],
                          "min_samples_split": [2,4,6],
                          "min_samples_leaf": [1,2,4]
                         }

                # Setting up the randomized search (seeded so the sampled candidates are reproducible)
                search = RandomizedSearchCV(model, params, scoring='roc_auc', cv=cv, refit=True, random_state=42)

                # Running the search; refit=True refits the best candidate on x_train
                result = search.fit(x_train, y_train)

                # Getting the best model
                best_model = result.best_estimator_

                # Predicted probability of the second class (column 1)
                y_train_pred = best_model.predict_proba(x_train)[:,1]
                y_test_pred = best_model.predict_proba(x_test)[:,1]

                # Computing scores
                dict_temp = model_evaluation_df(y_train, y_train_pred, y_test, y_test_pred)

                # Collecting predicted values, the fitted model and the test rows' original index
                predictions_dict['preds_train'].append(y_train_pred)
                predictions_dict['preds_test'].append(y_test_pred)
                predictions_dict['best_model'].append(best_model)
                predictions_dict['index_original'].append(y_test.index.values)

                # Creating data frames with metrics and predictions
                scores_df = pd.DataFrame.from_dict(dict_temp, orient = 'columns')
                predictions_df = pd.DataFrame.from_dict(predictions_dict, orient = 'columns')

            # Appending yearly scores and predictions
            RF_yearly_scores.append(scores_df)
            RF_yearly_preds.append(predictions_df)

            # Print time for one year ('.2f' = two decimals; a bare '.2' means two
            # significant digits and prints e.g. 12.3 as '1.2e+01')
            end_time_year = time.time()
            run_time = (end_time_year - start_time_year)/60
            print(f'minutes to run one year: {run_time:.2f}')

        # Saving the results from one comparison
        RF_results_udlændinge.update({f'{ele_y}_scores': RF_yearly_scores,
                                      f'{ele_y}_preds': RF_yearly_preds})

        # One comparison time
        end_time_comparison = time.time()
        run_time = (end_time_comparison - start_time_comparison)/60
        print(f'minutes to run {ele_y}: {run_time:.2f}')

Saving data

In [None]:
# One score table per group: the yearly score frames of a group are stacked
# into a single frame and tagged with the group's key in the 'model' column.
group_scores = [
    pd.concat(RF_results_udlændinge[score_key], ignore_index=True).assign(model=score_key)
    for score_key in list_of_scores
]

# Store the list of score tables on disk
with open('RF_udlændinge_speeches_plot.pkl', 'wb') as out_file:
    pickle.dump(group_scores, out_file, protocol=4)

In [None]:
# For each group: a list with one frame per year, holding the exploded test
# predictions next to the exploded original index values.
group_preds = [
    [
        pd.DataFrame({"preds_test": list(yearly_preds['preds_test'].explode()),
                      "index_org": list(yearly_preds['index_original'].explode())})
        for yearly_preds in RF_results_udlændinge[preds_key]
    ]
    for preds_key in list_of_preds
]

# Store the nested list of prediction frames on disk
with open('RF_udlændinge_speeches_preds.pkl', 'wb') as out_file:
    pickle.dump(group_preds, out_file, protocol=4)

### Klima data

In [None]:
# Random Forest on the klima speeches: for every target column and every
# yearly frame, tune a classifier with a randomized search, score it on a
# held-out test split and keep both the metrics and the predictions.

# Target names (real and random) are read from the first yearly frame
_, _, _, y_list, y_list_rand = create_x_y(klima_speeches_yearly_tfids_dfs[0])

# Create dictionary with scores and predictions from each comparison
RF_results_klima = {}

with parallel_backend('multiprocessing', n_jobs=16):

    # Looping over different y values
    for ele_y, ele_y_rand in zip(y_list, y_list_rand):

        # Start time one comparison
        start_time_comparison = time.time()

        # One entry per year
        RF_yearly_scores = []
        RF_yearly_preds = []

        # Looping over all the years
        for yearly_df in klima_speeches_yearly_tfids_dfs:

            # Start time one year
            start_time_year = time.time()

            # Create yearly X and ys
            X, ys, y_rands, _, _ = create_x_y(yearly_df)
            y_yearly = ys[str(ele_y)]
            # Not used further in this cell; the lookup is kept from the original code
            y_rand_yearly = y_rands[str(ele_y_rand)]
            X_yearly = X

            # Containers for the metrics and predictions of this year
            dict_temp = {}
            predictions_dict = {'preds_train': [], 'preds_test': [], 'best_model': [], 'index_original': []}

            # Nothing to fit when the target sums to zero for this year
            if y_yearly.sum() == 0:

                # Use the empty placeholder frames defined elsewhere in the notebook
                scores_df = scores_df_empty
                predictions_df = preds_df_empty

            else:
                # Split data into train and test
                x_train, x_test, y_train, y_test = train_test_split(X_yearly, y_yearly, test_size = 0.25, random_state=42)

                # Initiate cross validation (seeded so the shuffled folds are reproducible)
                cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

                # Define the model (seeded so the fitted forest is reproducible)
                model = RandomForestClassifier(class_weight="balanced", random_state=42)

                # Set parameter ranges
                params = {"n_estimators": [16,32,64,128],
                          "max_depth": [3,5,7,9],
                          "min_samples_split": [2,4,6],
                          "min_samples_leaf": [1,2,4]
                         }

                # Setting up the randomized search (seeded so the sampled candidates are reproducible)
                search = RandomizedSearchCV(model, params, scoring='roc_auc', cv=cv, refit=True, random_state=42)

                # Running the search; refit=True refits the best candidate on x_train
                result = search.fit(x_train, y_train)

                # Getting the best model
                best_model = result.best_estimator_

                # Predicted probability of the second class (column 1)
                y_train_pred = best_model.predict_proba(x_train)[:,1]
                y_test_pred = best_model.predict_proba(x_test)[:,1]

                # Computing scores
                dict_temp = model_evaluation_df(y_train, y_train_pred, y_test, y_test_pred)

                # Collecting predicted values, the fitted model and the test rows' original index
                predictions_dict['preds_train'].append(y_train_pred)
                predictions_dict['preds_test'].append(y_test_pred)
                predictions_dict['best_model'].append(best_model)
                predictions_dict['index_original'].append(y_test.index.values)

                # Creating data frames with metrics and predictions
                scores_df = pd.DataFrame.from_dict(dict_temp, orient = 'columns')
                predictions_df = pd.DataFrame.from_dict(predictions_dict, orient = 'columns')

            # Appending yearly scores and predictions
            RF_yearly_scores.append(scores_df)
            RF_yearly_preds.append(predictions_df)

            # Print time for one year ('.2f' = two decimals; a bare '.2' means two
            # significant digits and prints e.g. 12.3 as '1.2e+01')
            end_time_year = time.time()
            run_time = (end_time_year - start_time_year)/60
            print(f'minutes to run one year: {run_time:.2f}')

        # Saving the results from one comparison
        RF_results_klima.update({f'{ele_y}_scores': RF_yearly_scores,
                                 f'{ele_y}_preds': RF_yearly_preds})

        # One comparison time
        end_time_comparison = time.time()
        run_time = (end_time_comparison - start_time_comparison)/60
        print(f'minutes to run {ele_y}: {run_time:.2f}')

Saving data

In [None]:
# One score table per group: the yearly score frames of a group are stacked
# into a single frame and tagged with the group's key in the 'model' column.
group_scores = [
    pd.concat(RF_results_klima[score_key], ignore_index=True).assign(model=score_key)
    for score_key in list_of_scores
]

# Store the list of score tables on disk
with open('RF_klima_speeches_plot.pkl', 'wb') as out_file:
    pickle.dump(group_scores, out_file, protocol=4)

In [None]:
# For each group: a list with one frame per year, holding the exploded test
# predictions next to the exploded original index values.
group_preds = [
    [
        pd.DataFrame({"preds_test": list(yearly_preds['preds_test'].explode()),
                      "index_org": list(yearly_preds['index_original'].explode())})
        for yearly_preds in RF_results_klima[preds_key]
    ]
    for preds_key in list_of_preds
]

# Store the nested list of prediction frames on disk
with open('RF_klima_speeches_preds.pkl', 'wb') as out_file:
    pickle.dump(group_preds, out_file, protocol=4)