In [None]:
from google.colab import drive
# Mount Google Drive so the CSV files read below are reachable under /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
# Load the "total3" table of each parliamentary term (EP6, EP7, EP8) in one pass.
EP6, EP7, EP8 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/ Thesis 2023/EP6_total3.csv",
        "/content/drive/MyDrive/ Thesis 2023/EP7_total3.csv",
        "/content/drive/MyDrive/ Thesis 2023/EP8_total3.csv",
    )
)


In [None]:
# Load the "RCVs" table of each parliamentary term (EP6, EP7, EP8) in one pass.
EP6_2, EP7_2, EP8_2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/ Thesis 2023/EP6_RCVs.csv",
        "/content/drive/MyDrive/ Thesis 2023/EP7_RCVs.csv",
        "/content/drive/MyDrive/ Thesis 2023/EP8_RCVs.csv",
    )
)

In [None]:
# Give EP8's policy-area column the same "Policy_area" name that is used for EP6 and EP7 below.
EP8 = EP8.rename(columns = {"De/Policy area":"Policy_area"})

In [None]:
# Discard the "Unnamed: 0" column (presumably a row index that was saved into the CSV).
# errors="ignore" makes this cell safe to re-run: without it a second execution
# raises KeyError because the column has already been removed.
EP6 = EP6.drop(columns=["Unnamed: 0"], errors="ignore")
EP7 = EP7.drop(columns=["Unnamed: 0"], errors="ignore")
EP8 = EP8.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
import numpy as np


In [None]:
import nltk
# Fetch the NLTK resources used below: WordNet and OMW 1.4 for the lemmatizer,
# the stop-word corpus, and the Punkt models used by word_tokenize.
# ('averaged_perceptron_tagger' is not downloaded; POS tagging is disabled.)
for nltk_resource in ('wordnet', 'stopwords', 'punkt', 'omw-1.4'):
    nltk.download(nltk_resource)

from nltk import word_tokenize, WordNetLemmatizer#, pos_tag, pos_tag_sents, sent_tokenize
from nltk.corpus import stopwords#, wordnet

class LemmaTokenizer:
    """Callable tokenizer that turns a document into a list of WordNet lemmas.

    Each token produced by ``word_tokenize`` is stripped of every
    non-alphanumeric character and lemmatized. A lemma is kept only when it is
    longer than two characters and the cleaned token contains no digit.
    """

    # Tokens to discard. Kept for backward compatibility: once non-alphanumeric
    # characters are stripped, a cleaned token can no longer equal any of these.
    ignore_tokens = [',', '.', ';', ':', '"', '``', "''", '`']

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the lemmas of ``doc`` (a string) that pass the filters."""
        res = []
        for token in word_tokenize(doc):
            word = ''.join(filter(str.isalnum, token))
            # Reject before lemmatizing so no work is spent on discarded tokens.
            if word in self.ignore_tokens or any(char.isdigit() for char in word):
                continue
            # Lemmatize once and reuse the result for both the length test and the output.
            lemma = self.wnl.lemmatize(word)
            if len(lemma) > 2:
                res.append(lemma)
        return res

# English stop words from NLTK (extend this list here to add custom stop words).
stop_words = stopwords.words('english')

# Run the stop words through the same tokenizer as the documents so that they
# are compared in lemmatized form; de-duplicate and sort the result.
tokenizer = LemmaTokenizer()
token_stop = sorted(set(tokenizer(' '.join(stop_words))))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


## Preprocessing of the data EP6

In [None]:
EP6["raw_text"].isna().sum() # count rows without text; the instances that have no text are to be removed


409

In [None]:
# Number of missing values in each of the other text fields.
for field in ("Type of Vote", "Policy_area", "Title"):
    print(EP6[field].isna().sum())

5
0
0


In [None]:
# Drop the votes that have no text, then replace every remaining NaN with ''.
# Chaining the two calls (instead of calling fillna(inplace=True) on the result
# of dropna) avoids pandas' SettingWithCopyWarning.
# NOTE(review): fillna('') is applied to all columns, including the per-MEP
# vote columns used as targets later -- confirm that this is intended.
EP6 = EP6.dropna(subset=['raw_text']).fillna('')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  EP6.fillna('', inplace=True)


In [None]:
# (rows, columns) of EP6 after removing the rows without text.
EP6.shape

(844, 945)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
# Text fields that are turned into TF-IDF features.
text_columns = ['raw_text', 'Type of Vote', 'Policy_area', 'Title']

# One identically configured TfidfVectorizer per text column, each registered
# under the column's own name.
transformer = ColumnTransformer(
    transformers=[
        (
            column,
            TfidfVectorizer(
                stop_words=token_stop,
                tokenizer=tokenizer,
                lowercase=True,
                strip_accents='unicode',
                min_df=0.05,
                max_df=0.9,
                max_features=9000,
                ngram_range=(1, 3),
            ),
            column,
        )
        for column in text_columns
    ])

# Learn the vocabularies and build the combined TF-IDF matrix for EP6.
tfidf_array = transformer.fit_transform(EP6[text_columns])






In [None]:
# Dimensions of the matrix returned by the column transformer for EP6.
print(tfidf_array.shape)

(844, 6488)


In [None]:
from sklearn.decomposition import TruncatedSVD
# Reduce the TF-IDF matrix to a small number of dense components with truncated SVD.
number_dimensions = 21
lsa_model = TruncatedSVD(
    random_state=420,
    n_components=number_dimensions,
)
lsa_comps = lsa_model.fit_transform(tfidf_array)

# Sum of the explained-variance ratios of the retained components.
explained_variance_total = lsa_model.explained_variance_ratio_.sum()
print(explained_variance_total)

0.6830735446941447


## Create a model for each MEP--> EP6


Logistic Regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Train and evaluate one scaled logistic-regression model per MEP column of EP6.
# Each model is scored by the mean accuracy of 5-fold stratified cross-validation
# on the truncated-SVD features.
accuracy_results = {}  # MEP column name -> {'accuracy': mean CV accuracy}

# The feature matrix and the CV splitter are identical for every MEP, so they
# are created once instead of on every loop iteration.
X = lsa_comps
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): columns[5:] is assumed to hold exactly the per-MEP vote
# columns (the first five being text/metadata) -- confirm.
for col in EP6.columns[5:]:
    y = EP6[col].values  # target: the values recorded for this MEP

    # A classifier cannot be fitted when only one label is present.
    num_classes = len(np.unique(y))
    if num_classes < 2:
        print(f"Skipping MEP {col} due to only one class present.")
        continue

    model = Pipeline([
        ('scaler', StandardScaler()),
        # max_iter is raised above the default of 100, which can stop the
        # solver before it converges (ConvergenceWarning).
        ('classifier', LogisticRegression(max_iter=1000)),
    ])

    accuracy_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    accuracy_results[col] = {'accuracy': np.mean(accuracy_scores)}

# One row per evaluated MEP, with the MEP identifier in its own 'MEP' column.
accuracy_df = pd.DataFrame.from_dict(accuracy_results, orient='index')
accuracy_df = accuracy_df.reset_index().rename(columns={'index': 'MEP'})


In [None]:
# Distribution of the per-MEP cross-validated accuracies.
accuracy_df.describe()

Unnamed: 0,accuracy
count,939.0
mean,0.711575
std,0.133015
min,0.335313
25%,0.618481
50%,0.733362
75%,0.812792
max,0.991709


In [None]:
# Summarise the per-MEP accuracies: extremes, median and mean.
accuracy_col = accuracy_df['accuracy']
min_accuracy, max_accuracy = accuracy_col.min(), accuracy_col.max()
median_accuracy, mean_accuracy = accuracy_col.median(), accuracy_col.mean()

# Report the four summary figures.
print(f"Minimum Accuracy: {min_accuracy}")
print(f"Maximum Accuracy: {max_accuracy}")
print(f"Median Accuracy: {median_accuracy}")
print(f"Mean Accuracy: {mean_accuracy}")


Minimum Accuracy: 0.335312764158918
Maximum Accuracy: 0.9917089320935475
Median Accuracy: 0.7333615102845872
Mean Accuracy: 0.7115749955514025


Linear SVC

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm

# Train and evaluate one scaled linear SVM per MEP column of EP6.
# Each model is scored by the mean accuracy of 5-fold stratified cross-validation
# on the truncated-SVD features.
accuracy_results = {}  # MEP column name -> {'accuracy': mean CV accuracy}

# The feature matrix and the CV splitter are identical for every MEP, so they
# are created once instead of on every loop iteration.
X = lsa_comps
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): columns[5:] is assumed to hold exactly the per-MEP vote
# columns (the first five being text/metadata) -- confirm.
for col in EP6.columns[5:]:
    y = EP6[col].values  # target: the values recorded for this MEP

    # A classifier cannot be fitted when only one label is present.
    num_classes = len(np.unique(y))
    if num_classes < 2:
        print(f"Skipping MEP {col} due to only one class present.")
        continue

    model = Pipeline([
        ('scaler', StandardScaler()),
        # random_state is fixed so that repeated runs report the same
        # accuracies even when the solver uses random numbers.
        ('classifier', svm.LinearSVC(random_state=42)),
    ])

    accuracy_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    accuracy_results[col] = {'accuracy': np.mean(accuracy_scores)}

# One row per evaluated MEP, with the MEP identifier in its own 'MEP' column.
accuracy_df = pd.DataFrame.from_dict(accuracy_results, orient='index')
accuracy_df = accuracy_df.reset_index().rename(columns={'index': 'MEP'})


In [None]:
# Distribution of the per-MEP cross-validated accuracies.
accuracy_df.describe()

Unnamed: 0,accuracy
count,939.0
mean,0.714518
std,0.132594
min,0.337687
25%,0.62027
50%,0.735771
75%,0.814007
max,0.992892


In [None]:
# Summarise the per-MEP accuracies: extremes, median and mean.
accuracy_col = accuracy_df['accuracy']
min_accuracy, max_accuracy = accuracy_col.min(), accuracy_col.max()
median_accuracy, mean_accuracy = accuracy_col.median(), accuracy_col.mean()

# Report the four summary figures.
print(f"Minimum Accuracy: {min_accuracy}")
print(f"Maximum Accuracy: {max_accuracy}")
print(f"Median Accuracy: {median_accuracy}")
print(f"Mean Accuracy: {mean_accuracy}")


Minimum Accuracy: 0.33768667230205696
Maximum Accuracy: 0.9928923640462102
Median Accuracy: 0.7357706396167936
Mean Accuracy: 0.7145175589818008


## Preprocessing of the data EP7

In [None]:
EP7["raw_text"].isna().sum() # count rows without text; the instances that have no text are to be removed


0

In [None]:
# Number of missing values in each of the other text fields.
for field in ("Type of Vote", "Policy_area", "Title"):
    print(EP7[field].isna().sum())

0
0
0


In [None]:
# Replace every NaN with an empty string. The result is rebound instead of
# mutating the frame with inplace=True, so the cell stays idempotent.
# NOTE(review): this applies to all columns, including the per-MEP vote
# columns used as targets later -- confirm that this is intended.
EP7 = EP7.fillna('')


In [None]:
# (rows, columns) of EP7 after filling missing values.
EP7.shape

(1662, 858)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
# Text fields that are turned into TF-IDF features.
text_columns = ['raw_text', 'Type of Vote', 'Policy_area', 'Title']

# One identically configured TfidfVectorizer per text column, each registered
# under the column's own name.
transformer = ColumnTransformer(
    transformers=[
        (
            column,
            TfidfVectorizer(
                stop_words=token_stop,
                tokenizer=tokenizer,
                lowercase=True,
                strip_accents='unicode',
                min_df=0.05,
                max_df=0.9,
                max_features=9000,
                ngram_range=(1, 3),
            ),
            column,
        )
        for column in text_columns
    ])

# Learn the vocabularies and build the combined TF-IDF matrix for EP7.
tfidf_array = transformer.fit_transform(EP7[text_columns])






In [None]:
from sklearn.decomposition import TruncatedSVD
# Reduce the TF-IDF matrix to a small number of dense components with truncated SVD.
number_dimensions = 20
lsa_model = TruncatedSVD(
    random_state=420,
    n_components=number_dimensions,
)
lsa_comps = lsa_model.fit_transform(tfidf_array)

# Sum of the explained-variance ratios of the retained components.
explained_variance_total = lsa_model.explained_variance_ratio_.sum()
print(explained_variance_total)

0.7106321990911993


## Create a model for each MEP--> EP7

---



Logistic Regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Train and evaluate one scaled logistic-regression model per MEP column of EP7.
# Each model is scored by the mean accuracy of 5-fold stratified cross-validation
# on the truncated-SVD features.
accuracy_results = {}  # MEP column name -> {'accuracy': mean CV accuracy}

# The feature matrix and the CV splitter are identical for every MEP, so they
# are created once instead of on every loop iteration.
X = lsa_comps
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): columns[5:] is assumed to hold exactly the per-MEP vote
# columns (the first five being text/metadata) -- confirm.
for col in EP7.columns[5:]:
    y = EP7[col].values  # target: the values recorded for this MEP

    # A classifier cannot be fitted when only one label is present.
    num_classes = len(np.unique(y))
    if num_classes < 2:
        print(f"Skipping MEP {col} due to only one class present.")
        continue

    model = Pipeline([
        ('scaler', StandardScaler()),
        # With the default max_iter the solver stopped with
        # "TOTAL NO. of ITERATIONS REACHED LIMIT" on this data, so the
        # iteration budget is raised to let it converge.
        ('classifier', LogisticRegression(max_iter=1000)),
    ])

    accuracy_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    accuracy_results[col] = {'accuracy': np.mean(accuracy_scores)}

# One row per evaluated MEP, with the MEP identifier in its own 'MEP' column.
accuracy_df = pd.DataFrame.from_dict(accuracy_results, orient='index')
accuracy_df = accuracy_df.reset_index().rename(columns={'index': 'MEP'})


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Skipping MEP 736 due to only one class present.
Skipping MEP 737 due to only one class present.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
# Distribution of the per-MEP cross-validated accuracies.
accuracy_df.describe()

Unnamed: 0,accuracy
count,851.0
mean,0.716195
std,0.13004
min,0.355006
25%,0.617925
50%,0.734665
75%,0.823106
max,0.990372


In [None]:
# Summarise the per-MEP accuracies: extremes, median and mean.
accuracy_col = accuracy_df['accuracy']
min_accuracy, max_accuracy = accuracy_col.min(), accuracy_col.max()
median_accuracy, mean_accuracy = accuracy_col.median(), accuracy_col.mean()

# Report the four summary figures.
print(f"Minimum Accuracy: {min_accuracy}")
print(f"Maximum Accuracy: {max_accuracy}")
print(f"Median Accuracy: {median_accuracy}")
print(f"Mean Accuracy: {mean_accuracy}")


Minimum Accuracy: 0.3550056080176562
Maximum Accuracy: 0.9903723000108542
Median Accuracy: 0.7346647852671949
Mean Accuracy: 0.7161946097981663


Linear SVC

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm

# Train and evaluate one scaled linear SVM per MEP column of EP7.
# Each model is scored by the mean accuracy of 5-fold stratified cross-validation
# on the truncated-SVD features.
accuracy_results = {}  # MEP column name -> {'accuracy': mean CV accuracy}

# The feature matrix and the CV splitter are identical for every MEP, so they
# are created once instead of on every loop iteration.
X = lsa_comps
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): columns[5:] is assumed to hold exactly the per-MEP vote
# columns (the first five being text/metadata) -- confirm.
for col in EP7.columns[5:]:
    y = EP7[col].values  # target: the values recorded for this MEP

    # A classifier cannot be fitted when only one label is present.
    num_classes = len(np.unique(y))
    if num_classes < 2:
        print(f"Skipping MEP {col} due to only one class present.")
        continue

    model = Pipeline([
        ('scaler', StandardScaler()),
        # random_state is fixed so that repeated runs report the same
        # accuracies even when the solver uses random numbers.
        ('classifier', svm.LinearSVC(random_state=42)),
    ])

    accuracy_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    accuracy_results[col] = {'accuracy': np.mean(accuracy_scores)}

# One row per evaluated MEP, with the MEP identifier in its own 'MEP' column.
accuracy_df = pd.DataFrame.from_dict(accuracy_results, orient='index')
accuracy_df = accuracy_df.reset_index().rename(columns={'index': 'MEP'})


In [None]:
# Distribution of the per-MEP cross-validated accuracies.
accuracy_df.describe()

Unnamed: 0,accuracy
count,851.0
mean,0.715944
std,0.130398
min,0.351382
25%,0.61733
50%,0.733458
75%,0.823709
max,0.990975


In [None]:
# Summarise the per-MEP accuracies: extremes, median and mean.
accuracy_col = accuracy_df['accuracy']
min_accuracy, max_accuracy = accuracy_col.min(), accuracy_col.max()
median_accuracy, mean_accuracy = accuracy_col.median(), accuracy_col.mean()

# Report the four summary figures.
print(f"Minimum Accuracy: {min_accuracy}")
print(f"Maximum Accuracy: {max_accuracy}")
print(f"Median Accuracy: {median_accuracy}")
print(f"Mean Accuracy: {mean_accuracy}")


Minimum Accuracy: 0.3513821049965628
Maximum Accuracy: 0.9909747096494085
Median Accuracy: 0.7334581569521328
Mean Accuracy: 0.7159442823112779


## Preprocessing of the data EP8

In [None]:
EP8["raw_text"].isna().sum() # count rows without text; the instances that have no text are to be removed


0

In [None]:
# Number of missing values in each of the other text fields.
for field in ("Type of Vote", "Policy_area", "Title"):
    print(EP8[field].isna().sum())

0
1
0


In [None]:
# Replace every NaN with an empty string. The result is rebound instead of
# mutating the frame with inplace=True, so the cell stays idempotent.
# NOTE(review): this applies to all columns, including the per-MEP vote
# columns used as targets later -- confirm that this is intended.
EP8 = EP8.fillna('')


In [None]:
# (rows, columns) of EP8 after filling missing values.
EP8.shape

(2069, 863)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
# Text fields that are turned into TF-IDF features.
text_columns = ['raw_text', 'Type of Vote', 'Policy_area', 'Title']

# One identically configured TfidfVectorizer per text column, each registered
# under the column's own name.
transformer = ColumnTransformer(
    transformers=[
        (
            column,
            TfidfVectorizer(
                stop_words=token_stop,
                tokenizer=tokenizer,
                lowercase=True,
                strip_accents='unicode',
                min_df=0.05,
                max_df=0.9,
                max_features=9000,
                ngram_range=(1, 3),
            ),
            column,
        )
        for column in text_columns
    ])

# Learn the vocabularies and build the combined TF-IDF matrix for EP8.
tfidf_array = transformer.fit_transform(EP8[text_columns])






In [None]:
from sklearn.decomposition import TruncatedSVD
# Reduce the TF-IDF matrix to a small number of dense components with truncated SVD.
number_dimensions = 21
lsa_model = TruncatedSVD(
    random_state=420,
    n_components=number_dimensions,
)
lsa_comps = lsa_model.fit_transform(tfidf_array)

# Sum of the explained-variance ratios of the retained components.
explained_variance_total = lsa_model.explained_variance_ratio_.sum()
print(explained_variance_total)

0.7191294514196767


## Create a model for each MEP--> EP8

---


Changes:
> With max_features = 9000 and 40 SVD components, the average accuracy with logistic regression is 69%.

> With max_features = 9000 and 40 SVD components, the average accuracy with logistic regression is 71%.


Logistic Regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Train and evaluate one scaled logistic-regression model per MEP column of EP8.
# Each model is scored by the mean accuracy of 5-fold stratified cross-validation
# on the truncated-SVD features.
accuracy_results = {}  # MEP column name -> {'accuracy': mean CV accuracy}

# The feature matrix and the CV splitter are identical for every MEP, so they
# are created once instead of on every loop iteration.
X = lsa_comps
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): columns[5:] is assumed to hold exactly the per-MEP vote
# columns (the first five being text/metadata) -- confirm.
for col in EP8.columns[5:]:
    y = EP8[col].values  # target: the values recorded for this MEP

    # A classifier cannot be fitted when only one label is present.
    num_classes = len(np.unique(y))
    if num_classes < 2:
        print(f"Skipping MEP {col} due to only one class present.")
        continue

    model = Pipeline([
        ('scaler', StandardScaler()),
        # max_iter is raised above the default of 100, which can stop the
        # solver before it converges (ConvergenceWarning).
        ('classifier', LogisticRegression(max_iter=1000)),
    ])

    accuracy_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    accuracy_results[col] = {'accuracy': np.mean(accuracy_scores)}

# One row per evaluated MEP, with the MEP identifier in its own 'MEP' column.
accuracy_df = pd.DataFrame.from_dict(accuracy_results, orient='index')
accuracy_df = accuracy_df.reset_index().rename(columns={'index': 'MEP'})


In [None]:
# Distribution of the per-MEP cross-validated accuracies.
accuracy_df.describe()

Unnamed: 0,accuracy
count,855.0
mean,0.719169
std,0.135453
min,0.376019
25%,0.604883
50%,0.739491
75%,0.835669
max,0.9971


In [None]:
# Summarise the per-MEP accuracies: extremes, median and mean.
accuracy_col = accuracy_df['accuracy']
min_accuracy, max_accuracy = accuracy_col.min(), accuracy_col.max()
median_accuracy, mean_accuracy = accuracy_col.median(), accuracy_col.mean()

# Report the four summary figures.
print(f"Minimum Accuracy: {min_accuracy}")
print(f"Maximum Accuracy: {max_accuracy}")
print(f"Median Accuracy: {median_accuracy}")
print(f"Mean Accuracy: {mean_accuracy}")


Minimum Accuracy: 0.3760185282661333
Maximum Accuracy: 0.9971002795615913
Median Accuracy: 0.7394907066240892
Mean Accuracy: 0.7191689108412014


Linear SVC

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm

# Train and evaluate one scaled linear SVM per MEP column of EP8.
# Each model is scored by the mean accuracy of 5-fold stratified cross-validation
# on the truncated-SVD features.
accuracy_results = {}  # MEP column name -> {'accuracy': mean CV accuracy}

# The feature matrix and the CV splitter are identical for every MEP, so they
# are created once instead of on every loop iteration.
X = lsa_comps
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): columns[5:] is assumed to hold exactly the per-MEP vote
# columns (the first five being text/metadata) -- confirm.
for col in EP8.columns[5:]:
    y = EP8[col].values  # target: the values recorded for this MEP

    # A classifier cannot be fitted when only one label is present.
    num_classes = len(np.unique(y))
    if num_classes < 2:
        print(f"Skipping MEP {col} due to only one class present.")
        continue

    model = Pipeline([
        ('scaler', StandardScaler()),
        # random_state is fixed so that repeated runs report the same
        # accuracies even when the solver uses random numbers.
        ('classifier', svm.LinearSVC(random_state=42)),
    ])

    accuracy_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    accuracy_results[col] = {'accuracy': np.mean(accuracy_scores)}

# One row per evaluated MEP, with the MEP identifier in its own 'MEP' column.
accuracy_df = pd.DataFrame.from_dict(accuracy_results, orient='index')
accuracy_df = accuracy_df.reset_index().rename(columns={'index': 'MEP'})


In [None]:
# Distribution of the per-MEP cross-validated accuracies.
accuracy_df.describe()

Unnamed: 0,accuracy
count,855.0
mean,0.718081
std,0.13548
min,0.37264
25%,0.604391
50%,0.741905
75%,0.831807
max,0.9971


In [None]:
# Summarise the per-MEP accuracies: extremes, median and mean.
accuracy_col = accuracy_df['accuracy']
min_accuracy, max_accuracy = accuracy_col.min(), accuracy_col.max()
median_accuracy, mean_accuracy = accuracy_col.median(), accuracy_col.mean()

# Report the four summary figures.
print(f"Minimum Accuracy: {min_accuracy}")
print(f"Maximum Accuracy: {max_accuracy}")
print(f"Median Accuracy: {median_accuracy}")
print(f"Mean Accuracy: {mean_accuracy}")
