In [None]:
# packages
import catboost
import gensim.models
import gap_statistic
import nltk
import re
import copy
import pandas as pd
import numpy as np
import gensim.downloader as wv
from gensim import utils
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords 
from nltk import download
from nltk.cluster import KMeansClusterer
download('stopwords')

In [None]:

# class for preprocessing data - stemming, tokenizing, removing stop words
# Module-level Porter stemmer; PrepareSentance.tokenize calls porter.stem on
# every token when stemming is enabled.
porter = PorterStemmer()

class PrepareSentance():
    """Tokenize one text column of a data frame into lists of terms.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the text to process.
    text_column : str
        Name of the column in ``df`` that contains the text.

    Attributes
    ----------
    processed_df : list of list of str
        Token lists produced by the most recent ``tokenize`` call
        (empty until ``tokenize`` has been called).
    """

    def __init__(self, df, text_column):
        self.df = df
        self.text_column = text_column
        self.processed_df = []

    def tokenize(self, stem=True, remove_stopwords=True):
        """Return one token list per row of the text column.

        Each text is split with ``utils.simple_preprocess``; English stop
        words are then dropped (if ``remove_stopwords``) and the remaining
        tokens are stemmed with the module-level ``porter`` stemmer
        (if ``stem``). The result is also stored on ``self.processed_df``.
        """
        frame = self.df.copy()
        tokens = [utils.simple_preprocess(text) for text in frame[self.text_column]]

        if remove_stopwords:
            stop_words = set(stopwords.words('english'))
            tokens = [[word for word in doc if word not in stop_words] for doc in tokens]

        if stem:
            tokens = [[porter.stem(word) for word in doc] for doc in tokens]

        self.processed_df = tokens
        return tokens
      

# classes for preprocessing training/test sets
class TabularDescription():
    """Base class that turns bill titles into a bill-by-feature count table.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a ``billid`` column, the column named by ``text_column``
        and a ``minor`` column (read when the labels are built). If a
        ``congress_gov_major_topic`` column is present it is kept, indexed by
        ``billid``, and later appended to the feature table.
    text_column : str
        Name of the column holding the text to tokenize.
    word_clusters : pandas.DataFrame
        Table with ``term`` and ``cluster`` columns mapping terms to clusters.
    single_words : pandas.DataFrame, optional
        Table with a ``term`` column; every listed term found in the text
        gets its own count column in addition to the cluster columns.
    """

    _TOPIC_COLUMN = 'congress_gov_major_topic'

    def __init__(self, dataset, text_column, word_clusters, single_words=None):
        self.set = dataset
        self.text_column = text_column
        self.word_clusters = word_clusters
        self.single_words = single_words
        self.x = None
        self.y = None
        self.training_features = None
        if 'congress_gov_major_topic' in list(dataset.columns):
            congress_subject_area = dataset[['billid', 'congress_gov_major_topic']]
            congress_subject_area = congress_subject_area.set_index(keys='billid')
            self.congress_subject_area = congress_subject_area
        else:
            self.congress_subject_area = None

    def get_dataset(self):
        return self.set

    def get_text_column(self):
        return self.text_column

    def get_word_clusters(self):
        return self.word_clusters

    def get_single_words(self):
        return self.single_words

    def get_congress_subject_area(self):
        return self.congress_subject_area

    def get_training_features(self):
        """Return a deep copy of the stored feature-name list (or None)."""
        tsf = copy.deepcopy(self.training_features)
        return tsf

    def _tokenized_terms(self, stem, remove_stopwords):
        """Return a long table with one (billid, term) row per token."""
        ts = self.get_dataset()
        ts_pp = PrepareSentance(df=ts, text_column=self.get_text_column())
        token_lists = ts_pp.tokenize(stem=stem, remove_stopwords=remove_stopwords)
        # Pair ids with token lists by position. A label lookup such as
        # ts.billid[b] fails as soon as the dataset does not carry a default
        # 0..n-1 index (e.g. after filtering or sampling).
        bill_ids = ts['billid'].tolist()
        return pd.DataFrame({
            'billid': [bill for bill, tokens in zip(bill_ids, token_lists) for _ in tokens],
            'term': [term for tokens in token_lists for term in tokens]
        })

    def _build_count_matrix(self, stem, remove_stopwords):
        """Return per-bill counts of cluster hits plus optional single-word counts."""
        ts_train = self._tokenized_terms(stem=stem, remove_stopwords=remove_stopwords)
        ts_train = ts_train.merge(right=self.get_word_clusters(), how='left')

        # A left merge turns an integer cluster column into floats as soon as
        # one term is unmatched, so the same cluster would be named 'c_3' in
        # one set and 'c_3.0' in another. Always format integer ids as floats
        # so the names are stable and match what is produced whenever any
        # term is unmatched.
        cluster_dtype = ts_train['cluster'].dtype
        if isinstance(cluster_dtype, np.dtype) and cluster_dtype.kind in 'iu':
            ts_train['cluster'] = ts_train['cluster'].astype(float)

        ts_train = ts_train.astype(str)
        ts_train['cluster_name'] = 'c_' + ts_train['cluster']

        ts_dtm = ts_train.groupby(['billid', 'cluster_name']).size().reset_index(name='n')
        ts_dtm = ts_dtm.pivot(index="billid", columns="cluster_name", values="n").fillna(0)
        # 'c_nan' collects terms without a cluster; it is absent when every
        # term is clustered, so a missing column must not raise.
        ts_dtm = ts_dtm.drop(columns='c_nan', errors='ignore')

        sw = self.get_single_words()
        if sw is not None:
            ts_sw = ts_train.merge(right=sw, how='inner')
            ts_sw = ts_sw.groupby(['billid', 'term']).size().reset_index(name='n')
            ts_sw = ts_sw.pivot(index="billid", columns="term", values="n")
            ts_dtm = ts_dtm.merge(right=ts_sw, left_index=True, right_index=True, how='left').fillna(0)
        return ts_dtm

    def _labels_for(self, bill_index):
        """Return the ``minor`` label (as str) for each bill id, in the given order."""
        ts = self.get_dataset()
        y = pd.DataFrame({'billid': bill_index}).merge(right=ts[['billid', 'minor']],
                                                       how='left', on='billid')
        y = y.astype('str')
        return list(y['minor'])


class TabularDescriptionTrain(TabularDescription):
    """Builds the training feature table and records its column names."""

    def __init__(self, dataset, text_column, word_clusters, single_words=None):
        super().__init__(dataset, text_column, word_clusters, single_words=single_words)

    def prepare_set_for_training(self, stem=True, remove_stopwords=True):
        """Build the training features and labels.

        Returns
        -------
        (pandas.DataFrame, list of str)
            Feature table indexed by bill id and the matching ``minor``
            labels. Both are also stored on ``self.x`` / ``self.y``, and the
            column names on ``self.training_features``.
        """
        ts_dtm = self._build_count_matrix(stem=stem, remove_stopwords=remove_stopwords)

        sa = self.get_congress_subject_area()
        if sa is not None:
            ts_dtm = ts_dtm.merge(right=sa, left_index=True, right_index=True, how='left')

        y = self._labels_for(ts_dtm.index)

        self.x = ts_dtm
        self.y = y
        self.training_features = list(ts_dtm.columns)
        return ts_dtm, y


class TabularDescriptionTest(TabularDescription):
    """Builds a feature table aligned to the columns of a training set.

    ``training_features`` is the column list produced by
    ``TabularDescriptionTrain.prepare_set_for_training``.
    """

    def __init__(self, dataset, text_column, word_clusters, training_features, single_words=None):
        super().__init__(dataset, text_column, word_clusters, single_words=single_words)
        self.training_features = training_features

    def prepare_set_for_evaluation(self, stem=True, remove_stopwords=True):
        """Build features and labels with exactly the training columns.

        Columns that never occur in this set are added filled with zeros;
        columns unknown to the training set are dropped.

        Returns
        -------
        (pandas.DataFrame, list of str)
            Feature table indexed by bill id and the matching ``minor``
            labels. Both are also stored on ``self.x`` / ``self.y``.

        Raises
        ------
        TypeError
            If no training feature list was supplied.
        """
        columns_to_keep = self.get_training_features()
        if columns_to_keep is None:
            raise TypeError('training_features must be a list of column names, got None')
        columns_to_keep = list(columns_to_keep)

        ts_dtm = self._build_count_matrix(stem=stem, remove_stopwords=remove_stopwords)

        # The topic column comes from the dataset itself, not from the counts.
        # Append it only when the training set used it; otherwise the result
        # would not match the training columns.
        sa = self.get_congress_subject_area()
        use_topic = sa is not None and self._TOPIC_COLUMN in columns_to_keep
        if use_topic:
            columns_to_keep.remove(self._TOPIC_COLUMN)

        # Select the training columns in training order, zero-filling the ones
        # that are absent from this set.
        ts_dtm = ts_dtm.reindex(columns=columns_to_keep, fill_value=0.0)

        if use_topic:
            ts_dtm = ts_dtm.merge(right=sa, left_index=True, right_index=True, how='left')

        y = self._labels_for(ts_dtm.index)

        self.x = ts_dtm
        self.y = y
        return ts_dtm, y
    
    
# function for evaluating model performance    
def get_classification_results(cbm_model, x, y=None):
    """Collect class probabilities and predictions for ``x`` in one table.

    Parameters
    ----------
    cbm_model : fitted classifier
        Must provide ``predict_proba``, ``predict`` and ``classes_``.
    x : pandas.DataFrame
        Feature table; its index is copied into the ``billid`` column.
    y : list-like, optional
        Observed labels. When given, ``observed`` and ``match`` columns
        are added.

    Returns
    -------
    pandas.DataFrame
        One probability column per class, followed by ``probability``
        (row-wise maximum), ``predicted``, optionally ``observed`` and
        ``match``, and ``billid``.
    """
    results = pd.DataFrame(cbm_model.predict_proba(x))
    results.columns = cbm_model.classes_
    results['probability'] = results.max(axis=1)
    results['predicted'] = cbm_model.predict(x)

    if y is not None:
        results['observed'] = y
        results['match'] = [
            bool(observed == predicted)
            for observed, predicted in zip(results['observed'], results['predicted'])
        ]

    results['billid'] = list(x.index)
    return results

In [None]:
# files (prepared in advance)
population_csv_file, training_csv_file, evaluation_csv_file, test_csv_file = (
    'population_93_114.csv',
    'training_80.csv',
    'evaluation_set_80.csv',
    'test_set_80.csv',
)
# data: one frame per file, read in the same order as listed above
population = pd.read_csv(filepath_or_buffer=population_csv_file)
training_set = pd.read_csv(filepath_or_buffer=training_csv_file)
evaluation_set = pd.read_csv(filepath_or_buffer=evaluation_csv_file)
test_set = pd.read_csv(filepath_or_buffer=test_csv_file)

In [None]:
# word vectors on population data
# Keep congresses after the 107th. .copy() makes the filtered frame
# independent, so the title assignment below is not a chained assignment on a
# slice of the loaded table (avoids SettingWithCopyWarning).
population = population.query('congress > 107').copy()
# strip boilerplate phrases from the titles (case-insensitive)
population['title'] = [re.sub('united states code|other purposes', '', x, flags=re.IGNORECASE) for x in population['title']]

population_pp = PrepareSentance(df=population, text_column='title')
population_t = population_pp.tokenize()

# Generic legislative terms dropped before training the embedding.
# NOTE(review): tokens are stemmed at this point, so the unstemmed entries
# 'administration' and 'department' presumably never match - confirm and
# replace them with their stems if they are meant to be removed.
generic_terms = {'amend', 'act', 'bill', 'oper', 'implement', 'program', 'titl',
                 'administration', 'american', 'institut', 'department', 'secretari', 'offic'}
for title_tokens in population_t:
    # Remove every occurrence (list.remove only dropped the first one per
    # title). Slice assignment edits the list in place, so
    # population_pp.processed_df stays in sync with population_t.
    title_tokens[:] = [w for w in title_tokens if w not in generic_terms]

population_gm = gensim.models.Word2Vec(sentences=population_t, min_count=3, vector_size=300)
# one row per vocabulary term, one column per embedding dimension
dfwv = pd.DataFrame(population_gm.wv.vectors)
dfwv.index = population_gm.wv.index_to_key

In [None]:
# K-means clustering based on word vectors
# Estimate the number of clusters with the gap statistic, trying k = 1..349.
optimalK = gap_statistic.OptimalK(n_jobs=4, parallel_backend='joblib')
n_clusters = optimalK(dfwv, cluster_array=np.arange(1, 350))
# NOTE(review): this bare expression is not the last statement of the cell,
# so Jupyter does not display it.
n_clusters
optimalK.plot_results()
# Cluster the raw embedding matrix using cosine distance, 25 repeats.
# NOTE(review): no rng/seed argument is passed to the clusterer, so cluster
# assignments presumably differ between runs - confirm, and seed it if the
# downstream feature names must be reproducible.
X = population_gm.wv.vectors
NUM_CLUSTERS=n_clusters
kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25)
assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
# one row per vocabulary term with its assigned cluster id
word_clusters = pd.DataFrame({'term':population_gm.wv.index_to_key,'cluster':assigned_clusters})
# to_csv is called without index=False, so the row index is written as an
# extra leading column.
word_clusters.to_csv('word_clusters_edit.csv')

In [None]:
# list of terms to use as single words, outside of clusters
# Fixed: a comma was missing after 'construct', so Python concatenated it with
# the next literal into 'constructresearch' and neither term was ever matched.
# NOTE(review): the titles are stemmed before matching, so the unstemmed entry
# 'production' presumably never matches - confirm and use its stem if intended.
single_words = pd.DataFrame({'term':['safeti', 'transit', 'secur', 'effici', 'job', 'vehicl', 'reimburs', 'construct',
                                    'research', 'school', 'compet', 'youth', 'young', 'clean', 'production',
                                    'power']})

In [None]:
# training
# Build the feature table and labels for the training titles.
training_ins = TabularDescriptionTrain(
    dataset=training_set,
    text_column='title',
    word_clusters=word_clusters,
    single_words=single_words,
)
ts_x, ts_y = training_ins.prepare_set_for_training()
# show the feature names the evaluation and test sets will be aligned to
training_ins.get_training_features()

In [None]:
# evaluation
# Build the evaluation features with the same columns as the training set.
evaluation_ins = TabularDescriptionTest(
    dataset=evaluation_set,
    text_column='title',
    word_clusters=word_clusters,
    training_features=training_ins.get_training_features(),
    single_words=single_words,
)
ev_x, ev_y = evaluation_ins.prepare_set_for_evaluation()

In [None]:
# test set
# Build the held-out test features with the same columns as the training set.
test_ins = TabularDescriptionTest(
    dataset=test_set,
    text_column='title',
    word_clusters=word_clusters,
    training_features=training_ins.get_training_features(),
    single_words=single_words,
)
test_x, test_y = test_ins.prepare_set_for_evaluation()

In [None]:
# train catboost
major_model_80 = catboost.CatBoostClassifier(iterations=18000, max_depth=10, 
                                             learning_rate=0.025, l2_leaf_reg=0.75, 
                                             loss_function='MultiClassOneVsAll',
                                            rsm=0.2,
                                            cat_features=['congress_gov_major_topic']) 

# Fit on the matrices built in the training / evaluation cells above. The
# previous names (training_80_dtm, y_training_80, validation_80_dtm,
# y_validation_80) are not defined anywhere in this notebook, so the cell
# failed on a fresh kernel.
# NOTE(review): cat_features requires a 'congress_gov_major_topic' column in
# ts_x, which is only present when the training CSV contains it - confirm.
major_model_80_mt = major_model_80.fit(X=ts_x, y=ts_y, 
                                       early_stopping_rounds=50, 
                                       eval_set=(ev_x, ev_y))

In [None]:
# shrink number of iterations based on early detection of overfit
# NOTE(review): 9234 is hard-coded - presumably the best iteration observed in
# an earlier training run; it will not track a re-trained model. Confirm, or
# derive the value from the fitted model instead.
major_model_80_mt.shrink(9234)

In [None]:
# accuracy
# Score on the matrices produced by the preparation cells above; the previous
# *_80_dtm / y_*_80 names are not defined anywhere in this notebook.
ts_set_acc = major_model_80_mt.score(X=ts_x, y=ts_y)
eval_acc = major_model_80_mt.score(X=ev_x, y=ev_y)
test_set_acc = major_model_80_mt.score(X=test_x, y=test_y)

print(f'training set accuracy: {ts_set_acc}')
print(f'validation set accuracy: {eval_acc}')
print(f'test set accuracy: {test_set_acc}')

In [None]:
# predictions
# Use the fitted model and the test matrices defined above; `model`,
# `test_set_80_dtm` and `y_test_set_80` are not defined in this notebook.
pred = get_classification_results(cbm_model=major_model_80_mt, x=test_x, y=test_y)