<a href="https://colab.research.google.com/github/faithrts/COMP-551/blob/GaryBranch/A2_Draft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set-up

In [1]:
### importing libraries and setting the random seed

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

import re
import math
import bisect
from scipy.stats import zscore
from scipy.io import arff
from importlib import reload

from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import load_svmlight_file
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split as skl_train_test_split
from sklearn.metrics import mutual_info_score

#import warnings
#warnings.filterwarnings('ignore')

np.random.seed(1234)

# a folder to store the saved graphs
#!mkdir images

# Data Handling

## Importing

In [2]:
### importing the files from the web to google colab

# retrieving the IMDB data
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

# unzipping the tar.gz file into google colab for easy access
!tar -xf  'aclImdb_v1.tar.gz'

--2022-11-01 15:49:10--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2022-11-01 15:49:15 (18.6 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



## Preprocessing

### Helper functions

In [3]:
def filter_svmlight(matrix, vocab):
  # boolean matrix for all values not equal to 0
  X_boolean = matrix != 0  

  # 1 X 1 matrix with the percentage of documents that includes each word (per column)
  X_doc_percentage = X_boolean.astype(int).sum(axis = 0) / matrix.shape[0]

  ''' finding the stopwords and rarewords '''

  # 1 X 1 boolean matrix indicates whether each column (word) is a stopword
  X_bool_stopwords = X_doc_percentage > 0.5

  # 1 X 1 boolean matrix indicates whether each column (word) is a rareword
  X_bool_rarewords = X_doc_percentage < 0.01

  # boolean list for whether each index (word) is not a stopword
  not_stopwords = [not word for word in X_bool_stopwords.tolist()[0]]

  # boolean list for whether each index (word) is not a rare word
  not_rarewords = [not word for word in X_bool_rarewords.tolist()[0]]

  ''' finding the column indices of words that are not stopwords or rare words '''

  not_stopword_indices = [index for index, x in enumerate(np.transpose(not_stopwords)) if x]
  not_rareword_indices = [index for index, x in enumerate(np.transpose(not_rarewords)) if x]

  # the intsersection of the two lists above are the indices of words that are neither
  # stopwords nor rare words
  not_stop_or_rare_indices = [index for index in not_stopword_indices if index in not_rareword_indices]

  ''' filtering for the words that are neither stopwords nor rare words '''

  # filters the original matrix
  X_filtered = matrix[:, not_stop_or_rare_indices]

  # filters the list of terms
  vocab_filtered = [vocab[index] for index in not_stop_or_rare_indices]

  return X_filtered, vocab_filtered

In [4]:
class linear_regression:
  def __init__(self, add_bias = True):
    self.add_bias = add_bias

  def fit(self, x, y):
    # if the dimension of x is 1
    if x.ndim == 1:
      # adds an extra dimension 
      # e.g., [1, 2, 3] -> [[1], [2], [3]]
      x = x[:, None]

    # the number of features
    N = x.shape[0]

    if self.add_bias:
      # adds bias by adding a constant feature of value 1
      # e.g., [[1], [2], [3]] -> [[1, 1], [2, 1], [3, 1]]
      x = np.column_stack([x, np.ones(N)])

    # w is the least square difference (w0 and w1)
    self.w = np.linalg.lstsq(x, y)[0]

    return self

  def predict(self, x):
    # the number of features
    N = x.shape[0]

    if self.add_bias:
      # adds bias by adding a constant feature of value 1
      # e.g., [[1], [2], [3]] -> [[1, 1], [2, 1], [3, 1]]
      x = np.column_stack([x, np.ones(N)])

    # predict the y values where @ denotes matrix multiplication
    # y = Xw
    yh = x @ self.w

    return yh

In [5]:
def standardize_array(arr):
  standardized_array = []

  mean_val = arr.mean()
  standard_dev = arr.std()

  for i in arr:
    new_val = (i - mean_val) / standard_dev
    standardized_array.append(new_val)

  return standardized_array

In [6]:
def standardize_list(list):
  standardized_list = []

  mean_val = sum(list) / len(list)
  standard_dev = np.std(list)

  for i in list:
    new_val = (i - mean_val) / standard_dev
    standardized_list.append(new_val)

  return standardized_list

In [7]:
def compute_zscores(df):
  y = df.iloc[:, -1]
  y_stan = standardize_list(y.tolist())

  N = df.shape[0]

  z_scores = []

  for col in df:
    x_stan = standardize_list(df[col].tolist())
    col_z_score = (np.transpose(x_stan) @ y_stan) / math.sqrt(N)

    z_scores.append(col_z_score)

  return z_scores

### Loading and cleaning IMDB data

In [8]:
### loading the svm files into sparse matrices

# X is the sparse matrix, y are the labels
X_IMDB_train, y_IMDB_train = load_svmlight_file('aclImdb/train/labeledBow.feat', dtype=int)

# X is the sparse matrix, y are the labels
X_IMDB_test, y_IMDB_test = load_svmlight_file('aclImdb/test/labeledBow.feat', dtype=int)

# saving a list of the terms/vocab
IMDB_vocab = [line.rstrip() for line in open('aclImdb/imdb.vocab')]

In [9]:
### filtering the IMDB matrices to remove stop words and rare words

X_IMDB_train_filtered, IMDB_train_vocab_filtered = filter_svmlight(X_IMDB_train, IMDB_vocab)
#X_IMDB_test_filtered, IMDB_test_vocab_filtered = filter_svmlight(X_IMDB_test, IMDB_vocab)

In [10]:
### creates dataframes out of the words that are neither stopwords nor rare words

IMDB_train_df = pd.DataFrame(X_IMDB_train_filtered.toarray(), columns = IMDB_train_vocab_filtered)
#IMDB_test_df = pd.DataFrame(X_IMDB_test_filtered.toarray(), columns = IMDB_test_vocab_filtered)

# adds the target labels as a column
IMDB_train_df['LABEL'] = y_IMDB_train.astype(int)
#IMDB_test_df['LABEL'] = y_IMDB_test.astype(int)

In [11]:
IMDB_train_df

Unnamed: 0,he,his,!,by,an,who,they,from,so,like,...,portray,length,discovered,aware,continues,below,opens,essentially,received,LABEL
0,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
1,0,0,1,0,0,0,0,1,0,3,...,0,0,0,0,0,0,0,0,0,7
2,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,9
3,0,0,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,10
4,0,0,0,1,1,2,0,1,0,1,...,0,0,0,0,0,0,0,0,0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,5,0,9,3,1,7,3,2,1,0,...,0,0,0,0,0,1,0,0,0,1
24996,2,2,1,0,0,1,1,1,0,2,...,0,0,0,0,0,0,0,0,0,1
24997,0,2,0,2,1,2,3,1,0,3,...,0,0,0,0,0,0,1,0,0,4
24998,0,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


### Loading and cleaning Twenty News Groups data

In [12]:
### selecting 4 categories and extracting the data from sklearn

fav_four = ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.politics.guns']

# 20 news groups training
twenty_train = fetch_20newsgroups(subset='train', categories=fav_four, remove=(['headers', 'footers', 'quotes']))
# 20 news groups testing
twenty_test = fetch_20newsgroups(subset='test', categories=fav_four, remove=(['headers', 'footers', 'quotes']))

In [13]:
### transforming the data into vectors

# creating a new CountVectorizer object
count_vect = CountVectorizer(max_df=0.5, min_df=0.01)

# builds a dictionary of features and transforms documents to feature
# vectors where each index represents the occurrence of a specific word
X_train_counts = count_vect.fit_transform(twenty_train.data)

# retrieving the names of the features
feature_names = count_vect.get_feature_names_out()

# creating a dataframe in which row represents a document and each column
# a word
twenty_train_df = pd.DataFrame(X_train_counts.toarray(), columns = feature_names)

In [14]:
### adding label column

twenty_train_df['LABEL'] = twenty_train.target

# one-hot encoding
twenty_train_df['LABEL'] = twenty_train_df['LABEL'].replace({0:'[1,0,0,0]', 1:'[0,1,0,0]', 2:'[0,0,1,0]', 3:'[0,0,0,1]'})

### Determining important features for IMDB data


In [15]:
### computing z-score of each feature

# returns a list of z-scores where the value at index i is the z-score
# of the word at column i in the dataframe
z_scores = compute_zscores(IMDB_train_df)

# computes the absolute values of each z-score
abs_z_scores = list(map(abs, z_scores))

In [16]:
### determining the 100 most "important" words based on their z-scores

# the indices of the words with the greatest absolute z-scores
top_50_zscores = np.argsort(np.array(abs_z_scores))[0:50]

# the indices of words with the lowest absolute z-scores
bottom_50_zscores = np.argsort(np.array(abs_z_scores))[::-1][0:50]

# the words with the top 50 absolute z-scores
top_50_words = [IMDB_vocab[i] for i in top_50_zscores]

# the words with the bottom 50 absolute z-scores
bottom_50_words = [IMDB_vocab[i] for i in bottom_50_zscores]

In [17]:
### analyzing the top 50 words

top_50_words

['elements',
 'disturbing',
 'value',
 'battle',
 'content',
 'final',
 'etc',
 'legend',
 'runs',
 'watching',
 'stephen',
 'question',
 'de',
 'heaven',
 'park',
 'filming',
 'johnny',
 'british',
 'bloody',
 'ago',
 'expect',
 'possible',
 'giving',
 'track',
 'effort',
 'political',
 'describe',
 'comedic',
 'lord',
 'incredible',
 'powerful',
 'air',
 'direction',
 'annoying',
 'fast',
 'flicks',
 'delivers',
 'surprisingly',
 'apartment',
 'romance',
 'well',
 'somehow',
 'predictable',
 'fell',
 'dvd',
 'new',
 'kid',
 'coming',
 'kills',
 'van']

In [18]:
### analyzing the bottom 50 words

bottom_50_words

['pacing',
 'when',
 'saw',
 'dialogue',
 'up',
 'women',
 'not',
 'wrong',
 'enjoy',
 'job',
 'become',
 "don't",
 'all',
 'tries',
 'given',
 'year',
 'chance',
 'one',
 'off',
 'version',
 'most',
 'thought',
 'overall',
 'above',
 'because',
 'keep',
 'title',
 'came',
 'guess',
 'shown',
 'first',
 'for',
 'worst',
 'dance',
 'reason',
 'material',
 'japanese',
 'trouble',
 'story',
 'between',
 'child',
 'animation',
 'side',
 'many',
 'decide',
 'success',
 'very',
 'age',
 'here',
 'honest']

### Determining important features for Twenty News Groups data

In [131]:
# here gary
#The actual usefulness of a feature depends on the model you use it with. 
#A feature is only useful to the extent that its relationship with the target is one your model can learn. 
#Just because a feature has a high MI score doesn't mean your model will be able to do anything with that information. 
#You may need to transform the feature first to expose the association.

def make_MI_scores(Labels_true, Labels_pred):
  mi_scores = mutual_info_score(Labels_true, Labels_pred)
  return mi_scores

def show_MI_scores(mi_scores, class_label):
  show_scores = pd.Series(mi_scores, name = "Mutual Info Scores of " + str(class_label))
  show_scores = show_scores.sort_values(ascending=False)
  return show_scores

In [132]:
temp1 = twenty_train_df.copy()
temp2 = twenty_train_df.copy()
temp3 = twenty_train_df.copy()
temp4 = twenty_train_df.copy()

temp1=temp1.replace({'[1,0,0,0]':1,'[0,1,0,0]':0,'[0,0,1,0]':0,'[0,0,0,1]':0})
temp2=temp2.replace({'[1,0,0,0]':0,'[0,1,0,0]':1,'[0,0,1,0]':0,'[0,0,0,1]':0})
temp3=temp3.replace({'[1,0,0,0]':0,'[0,1,0,0]':0,'[0,0,1,0]':1,'[0,0,0,1]':0})
temp4=temp4.replace({'[1,0,0,0]':0,'[0,1,0,0]':0,'[0,0,1,0]':0,'[0,0,0,1]':1})

temp1 = temp1.iloc[:, -1]
temp2 = temp2.iloc[:, -1]
temp3 = temp3.iloc[:, -1]
temp4 = temp4.iloc[:, -1]

temp1 = temp1.to_numpy()
temp2 = temp2.to_numpy()
temp3 = temp3.to_numpy()
temp4 = temp4.to_numpy()

In [135]:
MI_scores_class1 = []
MI_scores_class2 = []
MI_scores_class3 = []
MI_scores_class4 = []

for col in twenty_train_df.iloc[:,:-1]:
  col_list = twenty_train_df[col].tolist()
  MI_scores_class1.append(make_MI_scores(temp1, col_list))
  MI_scores_class2.append(make_MI_scores(temp2, col_list))
  MI_scores_class3.append(make_MI_scores(temp3, col_list))
  MI_scores_class4.append(make_MI_scores(temp4, col_list))


#print(MI_scores)
print(show_MI_scores(MI_scores_class1, "class1"))
print(show_MI_scores(MI_scores_class2, "class2"))
print(show_MI_scores(MI_scores_class3, "class3"))
print(show_MI_scores(MI_scores_class4, "class4"))

594     0.055424
167     0.038482
1122    0.035078
216     0.031178
169     0.030452
          ...   
993     0.000014
1446    0.000007
1451    0.000006
649     0.000005
1238    0.000002
Name: Mutual Info Scores of class1, Length: 1519, dtype: float64
603     0.075720
1338    0.036490
984     0.035055
1347    0.034606
533     0.031776
          ...   
1169    0.000102
913     0.000052
211     0.000035
578     0.000017
932     0.000002
Name: Mutual Info Scores of class2, Length: 1519, dtype: float64
1253    0.110866
885     0.050271
952     0.047070
865     0.038567
752     0.038438
          ...   
611     0.000026
307     0.000020
1509    0.000012
1247    0.000009
976     0.000002
Name: Mutual Info Scores of class3, Length: 1519, dtype: float64
612     0.100516
613     0.065760
1458    0.053483
541     0.045264
539     0.037375
          ...   
502     0.000061
1247    0.000051
1083    0.000048
807     0.000025
993     0.000012
Name: Mutual Info Scores of class4, Length: 1519, dtype: 

In [None]:
twenty_train_df

In [None]:
Labels_true = twenty_train_df.data

mi_scores = make_MI_scores(Labels_true, Labels_pred)
mi_scores[::3]

# Implementing models

## Helper functions

## Logistic regression

## Multi-class regression

# Running experiments

### Helper functions

## Logistic regression experiments

### Baseline accuracy tests

## Multi-class regression experiments

### Baseline accuracy tests
