<a href="https://colab.research.google.com/github/faithrts/COMP-551/blob/main/A2_Draft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set-up

In [3]:
### importing libraries and setting the random seed

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

import re
import math
import bisect
from scipy.stats import zscore
from scipy.io import arff
from importlib import reload

from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import load_svmlight_file
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split as skl_train_test_split

#import warnings
#warnings.filterwarnings('ignore')

np.random.seed(1234)

# a folder to store the saved graphs
#!mkdir images

# Data Handling

## Importing

In [2]:
### importing the files from the web to google colab

# retrieving the IMDB data
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

# unzipping the tar.gz file into google colab for easy access
!tar -xf  'aclImdb_v1.tar.gz'

--2022-10-30 14:30:27--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2022-10-30 14:30:29 (49.5 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



## Preprocessing

### Helper functions

In [98]:
def filter_svmlight(matrix, vocab):
  # boolean matrix for all values not equal to 0
  X_boolean = matrix != 0  

  # 1 X 1 matrix with the percentage of documents that includes each word (per column)
  X_doc_percentage = X_boolean.astype(int).sum(axis = 0) / matrix.shape[0]

  ''' finding the stopwords and rarewords '''

  # 1 X 1 boolean matrix indicates whether each column (word) is a stopword
  X_bool_stopwords = X_doc_percentage > 0.5

  # 1 X 1 boolean matrix indicates whether each column (word) is a rareword
  X_bool_rarewords = X_doc_percentage < 0.01

  # boolean list for whether each index (word) is not a stopword
  not_stopwords = [not word for word in X_bool_stopwords.tolist()[0]]

  # boolean list for whether each index (word) is not a rare word
  not_rarewords = [not word for word in X_bool_rarewords.tolist()[0]]

  ''' finding the column indices of words that are not stopwords or rare words '''

  not_stopword_indices = [index for index, x in enumerate(np.transpose(not_stopwords)) if x]
  not_rareword_indices = [index for index, x in enumerate(np.transpose(not_rarewords)) if x]

  # the intsersection of the two lists above are the indices of words that are neither
  # stopwords nor rare words
  not_stop_or_rare_indices = [index for index in not_stopword_indices if index in not_rareword_indices]

  ''' filtering for the words that are neither stopwords nor rare words '''

  # filters the original matrix
  X_filtered = matrix[:, not_stop_or_rare_indices]

  # filters the list of terms
  vocab_filtered = [vocab[index] for index in not_stop_or_rare_indices]

  return X_filtered, vocab_filtered

In [99]:
class linear_regression:
  def __init__(self, add_bias = True):
    self.add_bias = add_bias

  def fit(self, x, y):
    # if the dimension of x is 1
    if x.ndim == 1:
      # adds an extra dimension 
      # e.g., [1, 2, 3] -> [[1], [2], [3]]
      x = x[:, None]

    # the number of features
    N = x.shape[0]

    if self.add_bias:
      # adds bias by adding a constant feature of value 1
      # e.g., [[1], [2], [3]] -> [[1, 1], [2, 1], [3, 1]]
      x = np.column_stack([x, np.ones(N)])

    # w is the least square difference (w0 and w1)
    self.w = np.linalg.lstsq(x, y)[0]

    return self

  def predict(self, x):
    # the number of features
    N = x.shape[0]

    if self.add_bias:
      # adds bias by adding a constant feature of value 1
      # e.g., [[1], [2], [3]] -> [[1, 1], [2, 1], [3, 1]]
      x = np.column_stack([x, np.ones(N)])

    # predict the y values where @ denotes matrix multiplication
    # y = Xw
    yh = x @ self.w

    return yh

In [100]:
def standardize_array(arr):
  standardized_array = []

  mean_val = arr.mean()
  standard_dev = arr.std()

  for i in arr:
    new_val = (i - mean_val) / standard_dev
    standardized_array.append(new_val)

  return standardized_array

In [101]:
def standardize_list(list):
  standardized_list = []

  mean_val = sum(list) / len(list)
  standard_dev = np.std(list)

  for i in list:
    new_val = (i - mean_val) / standard_dev
    standardized_list.append(new_val)

  return standardized_list

In [108]:
def compute_zscores(df):
  y = df.iloc[:, -1]
  y_stan = standardize_list(y.tolist())

  N = df.shape[0]

  z_scores = []

  for col in IMDB_train_df.iloc[:, :-1]:
    x_stan = standardize_list(df[col].tolist())
    col_z_score = (np.transpose(x_stan) @ y_stan) / math.sqrt(N)

    z_scores.append(col_z_score)

  return z_scores

### Loading and cleaning IMDB data

In [129]:
### loading the svm files into sparse matrices

# X is the sparse matrix, y are the labels
X_IMDB_train, y_IMDB_train = load_svmlight_file('aclImdb/train/labeledBow.feat', dtype=int)

# X is the sparse matrix, y are the labels
X_IMDB_test, y_IMDB_test = load_svmlight_file('aclImdb/test/labeledBow.feat', dtype=int)

# saving a list of the terms/vocab
IMDB_vocab = [line.rstrip() for line in open('aclImdb/imdb.vocab')]

In [130]:
### filtering the IMDB matrices to remove stop words and rare words

X_IMDB_train_filtered, IMDB_train_vocab_filtered = filter_svmlight(X_IMDB_train, IMDB_vocab)
X_IMDB_test_filtered, IMDB_test_vocab_filtered = filter_svmlight(X_IMDB_test, IMDB_vocab)

In [149]:
### creates dataframes out of the words that are neither stopwords nor rare words

IMDB_train_df = pd.DataFrame(X_IMDB_train_filtered.toarray(), columns = IMDB_train_vocab_filtered)
IMDB_test_df = pd.DataFrame(X_IMDB_test_filtered.toarray(), columns = IMDB_test_vocab_filtered)

# adds the target labels as a column
IMDB_train_df['LABEL'] = y_IMDB_train.astype(int)
IMDB_test_df['LABEL'] = y_IMDB_test.astype(int)

# dataframes that preserve the movie review values
IMDB_train_zscore_df = IMDB_train_df.copy()
IMDB_test_zscore_df = IMDB_test_df.copy()

# dataframes that label negative reviews as 0 and positive reviews as 1
IMDB_train_df['LABEL'] = IMDB_train_df['LABEL'].replace({0:0, 1:0, 2:0, 3:0, 4:0, 5:0, 6:1, 7:1, 8:1, 9:1, 10:1})
IMDB_test_df['LABEL'] = IMDB_test_df['LABEL'].replace({0:0, 1:0, 2:0, 3:0, 4:0, 5:0, 6:1, 7:1, 8:1, 9:1, 10:1})

In [151]:
IMDB_train_df

Unnamed: 0,he,his,!,by,an,who,they,from,so,like,...,portray,length,discovered,aware,continues,below,opens,essentially,received,LABEL
0,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,1,0,0,0,0,1,0,3,...,0,0,0,0,0,0,0,0,0,1
2,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,1,1,2,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,5,0,9,3,1,7,3,2,1,0,...,0,0,0,0,0,1,0,0,0,0
24996,2,2,1,0,0,1,1,1,0,2,...,0,0,0,0,0,0,0,0,0,0
24997,0,2,0,2,1,2,3,1,0,3,...,0,0,0,0,0,0,1,0,0,0
24998,0,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [150]:
IMDB_test_df

Unnamed: 0,he,his,!,by,an,who,they,from,so,like,...,pair,thomas,protagonist,tape,range,generation,ryan,post,provided,LABEL
0,1,0,0,2,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,0,1,0,0,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,4,3,1,1,2,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
3,1,1,1,1,0,0,0,5,6,2,...,0,0,0,0,0,0,0,0,0,1
4,0,2,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,4,3,0,3,1,2,0,2,0,3,...,0,0,0,0,0,0,0,0,0,0
24996,1,1,0,2,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
24997,0,3,0,4,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24998,2,0,2,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Loading and cleaning Twenty News Groups data

In [23]:
### selecting 4 categories and extracting the data from sklearn

fav_four = ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.politics.guns']

# 20 news groups training
twenty_train = fetch_20newsgroups(subset='train', categories=fav_four, remove=(['headers', 'footers', 'quotes']))
# 20 news groups testing
twenty_test = fetch_20newsgroups(subset='test', categories=fav_four, remove=(['headers', 'footers', 'quotes']))

In [24]:
### transforming the data into vectors

# creating new CountVectorizer objects
count_vect = CountVectorizer(max_df=0.5, min_df=0.01)
count_vect_test = CountVectorizer()

# builds a dictionary of features and transforms documents to feature
# vectors where each index represents the occurrence of a specific word
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_test_counts = count_vect_test.fit_transform(twenty_test.data)

# retrieving the names of the features
train_feature_names = count_vect.get_feature_names_out()
test_feature_names = count_vect_test.get_feature_names_out()

# creating dataframes in which each row represents a document and each column
# a word
twenty_train_df = pd.DataFrame(X_train_counts.toarray(), columns = train_feature_names)
twenty_test_df = pd.DataFrame(X_test_counts.toarray(), columns = test_feature_names)

In [25]:
### adding label column

twenty_train_df['LABEL'] = twenty_train.target
twenty_test_df['LABEL'] = twenty_test.target

# one-hot encoding
twenty_train_df['LABEL'] = twenty_train_df['LABEL'].replace({0:'[1,0,0,0]', 1:'[0,1,0,0]', 2:'[0,0,1,0]', 3:'[0,0,0,1]'})
twenty_test_df['LABEL'] = twenty_test_df['LABEL'].replace({0:'[1,0,0,0]', 1:'[0,1,0,0]', 2:'[0,0,1,0]', 3:'[0,0,0,1]'})

In [26]:
twenty_train_df

Unnamed: 0,00,000,01,04,10,100,1000,11,12,13,...,year,years,yes,yesterday,yet,york,you,your,yourself,LABEL
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,2,0,0,"[1,0,0,0]"
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,8,3,0,"[0,0,0,1]"
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,"[0,0,0,1]"
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0,1,0,0]"
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0,0,1,0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2198,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0,0,1,0]"
2199,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,"[0,1,0,0]"
2200,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,"[1,0,0,0]"
2201,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,7,0,0,"[1,0,0,0]"


In [27]:
twenty_test_df

Unnamed: 0,00,000,0000,00000,000005102000,00041555,0004244402,00043819,00044808,00044939,...,zubrin,zug,zur,zurich,zvezdny,zvi,zwork,zyda,zyxel,LABEL
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0,0,1,0]"
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[1,0,0,0]"
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0,1,0,0]"
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0,1,0,0]"
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0,0,0,1]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1461,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0,0,1,0]"
1462,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0,0,1,0]"
1463,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0,1,0,0]"
1464,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0,1,0,0]"


### Determining important features for IMDB data


In [134]:
### computing z-score of each feature

# returns a list of z-scores where the value at index i is the z-score
# of the word at column i in the dataframe
z_scores = compute_zscores(IMDB_train_df)

# computes the absolute values of each z-score
abs_z_scores = list(map(abs, z_scores))

In [135]:
### determining the 100 most "important" words based on their z-scores

# the indices of the words with the greatest absolute z-scores
top_50_zscores = np.argsort(np.array(abs_z_scores))[0:50]

# the indices of words with the lowest absolute z-scores
bottom_50_zscores = np.argsort(np.array(abs_z_scores))[::-1][0:50]

# the words with the top 50 absolute z-scores
top_50_words = [IMDB_train_vocab_filtered[i] for i in top_50_zscores]

# the words with the bottom 50 absolute z-scores
bottom_50_words = [IMDB_train_vocab_filtered[i] for i in bottom_50_zscores]

In [136]:
### analyzing the top 50 words

top_50_words

['agree',
 'state',
 'motion',
 'ready',
 'longer',
 'camp',
 'sad',
 'bar',
 'appearance',
 'come',
 'ms',
 'acted',
 'common',
 'meaning',
 'followed',
 'sight',
 'constant',
 'general',
 'next',
 'clear',
 'hear',
 'forget',
 'mad',
 'think',
 'alive',
 'likes',
 'leaving',
 'pain',
 'jason',
 'beat',
 'extreme',
 'nearly',
 'can',
 'ending',
 'night',
 'hands',
 'talents',
 'across',
 'under',
 'church',
 'gone',
 'character',
 'previous',
 'notice',
 'dr',
 'accident',
 'things',
 'location',
 'rate',
 'blue']

In [137]:
### analyzing the bottom 50 words

bottom_50_words

['bad',
 'worst',
 'great',
 'waste',
 'awful',
 '?',
 'excellent',
 'no',
 'wonderful',
 'worse',
 'terrible',
 'boring',
 'best',
 'stupid',
 'nothing',
 'horrible',
 'poor',
 'minutes',
 'crap',
 'even',
 'just',
 'supposed',
 'love',
 'perfect',
 'poorly',
 'acting',
 'ridiculous',
 'plot',
 'beautiful',
 'lame',
 'amazing',
 'script',
 'loved',
 'favorite',
 'pointless',
 'avoid',
 "don't",
 'why',
 'superb',
 'highly',
 'annoying',
 'brilliant',
 'also',
 'mess',
 'dull',
 'wasted',
 'money',
 'very',
 'any',
 'fantastic']

In [159]:
### filtering the IMDB dataframes to only include words from the top 100 found

# making copies of the original dataframe
IMDB_train_original_df = IMDB_train_df.copy()
IMDB_test_original_df = IMDB_test_df.copy()

# making lists of the words to use as column names
train_new_words = top_50_words + bottom_50_words + ['LABEL']
test_new_words = [word for word in train_new_words if word in IMDB_test_original_df.columns.tolist()]

# filtering the IMDB dataframe to only include words from the top 100 found
IMDB_train_df = IMDB_train_df[train_new_words]
IMDB_test_df = IMDB_test_df[test_new_words]

# removing rows with all zeros from the dataframes
IMDB_train_df = IMDB_train_df.loc[(IMDB_train_df != 0).any(axis = 1)]
IMDB_test_df = IMDB_test_df.loc[(IMDB_test_df != 0).any(axis = 1)]

In [160]:
IMDB_train_df

Unnamed: 0,agree,state,motion,ready,longer,camp,sad,bar,appearance,come,...,brilliant,also,mess,dull,wasted,money,very,any,fantastic,LABEL
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,2,0,1,1
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0,0,0,0,0,0,0,1,0,1,...,0,1,0,0,0,0,1,0,0,0
24996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,0
24997,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,1,0,0
24998,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [161]:
IMDB_test_df

Unnamed: 0,agree,state,motion,ready,longer,camp,sad,bar,appearance,come,...,brilliant,also,mess,dull,wasted,money,very,any,fantastic,LABEL
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,3,0,0,1
2,0,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,2,1,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,1
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,2,2,1,0
24996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,1,0,0


### Determining important features for Twenty News Groups data

In [None]:
# here gary

# Implementing models

## Helper functions

## Logistic regression

## Multi-class regression

# Running experiments

### Helper functions

## Logistic regression experiments

### Baseline accuracy tests

## Multi-class regression experiments

### Baseline accuracy tests
