In [None]:
import sys, argparse
from scipy import sparse
from sklearn import linear_model
from collections import Counter
import numpy as np
import re
from collections import Counter
from collections import defaultdict

In [None]:
######################################################################
## Do not edit this block of code.
## This defines the dumb features the model starts with.
######################################################################


def dumb_featurize(text):
	"""Baseline featurizer: flag the presence of a few sentiment cue words.

	The text is split on single spaces and compared case-sensitively against
	two small fixed word lists.

	Returns a dict holding "contains_positive_word" and/or
	"contains_negative_word" (value 1) for each list that matched; the dict
	is empty when neither list matched.
	"""
	positive_cues = ("love", "like", "best")
	negative_cues = ("hate", "dislike", "worst", "awful")

	feats = {}
	for token in text.split(" "):
		# The two cue lists are disjoint, so a token matches at most one.
		if token in positive_cues:
			feats["contains_positive_word"] = 1
		elif token in negative_cues:
			feats["contains_negative_word"] = 1
	return feats



In [None]:
######################################################################
## Do not edit this block of code.
## This defines the sentiment classification class which
## loads the data and sets up the model.
######################################################################

class SentimentClassifier:
	"""Logistic-regression sentiment classifier over dict-valued features.

	`feature_method` is a callable mapping a text string to a dict of
	{feature name: value}. Its `__name__` is used as the lookup key into the
	module-level MIN_FEATURE_COUNT and L2_REGULARIZATION_STRENGTH dicts and as
	the prefix of the output files written by predict() and printWeights().
	"""

	def __init__(self, feature_method):
		# Maps feature name -> column index; filled by process(training=True).
		self.feature_vocab = {}
		self.feature_method = feature_method

	# Read data from file
	def load_data(self, filename):
		"""Read a tab-separated file and return a list of (column 0, column 1) pairs.

		Column 1 has trailing whitespace stripped. Blank lines are skipped
		instead of raising IndexError.
		"""
		data = []
		with open(filename, encoding="utf8") as file:
			for line in file:
				if not line.strip():
					continue
				cols = line.split("\t")
				data.append((cols[0], cols[1].rstrip()))
		return data

	# Featurize entire dataset
	def featurize(self, data):
		"""Apply the feature method to every text, keeping the first tuple element."""
		return [(label, self.feature_method(text)) for label, text in data]

	def _to_matrix(self, featurized):
		"""Build the sparse (rows x vocabulary) matrix for featurized data.

		Features that are not in self.feature_vocab are dropped.
		"""
		X = sparse.dok_matrix((len(featurized), len(self.feature_vocab)))
		for row, (_, feats) in enumerate(featurized):
			for feat, value in feats.items():
				col = self.feature_vocab.get(feat)
				if col is not None:
					X[row, col] = value
		return X

	# Read dataset and returned featurized representation as sparse matrix + label array
	def process(self, dataFile, training = False):
		"""Load and featurize `dataFile`; return (X, Y).

		With training=True the vocabulary is rebuilt from scratch: a feature
		is kept when the number of examples containing it is at least
		MIN_FEATURE_COUNT[<feature method name>]. Y holds 1 where the label
		is "pos" and 0 otherwise.
		"""
		data = self.featurize(self.load_data(dataFile))

		if training:
			# Start from an empty vocabulary so a repeated training pass
			# cannot mix fresh column ids with stale ones.
			self.feature_vocab = {}
			feature_doc_count = Counter()
			for _, feats in data:
				# feats is a dict, so each feature counts once per example.
				feature_doc_count.update(feats.keys())

			min_count = MIN_FEATURE_COUNT[self.feature_method.__name__]
			for feat, count in feature_doc_count.items():
				if count >= min_count:
					self.feature_vocab[feat] = len(self.feature_vocab)

		X = self._to_matrix(data)
		Y = np.zeros(len(data))
		for idx, (label, _) in enumerate(data):
			Y[idx] = 1 if label == "pos" else 0

		return X, Y

	def load_test(self, dataFile):
		"""Load and featurize a test file; return (X, ids).

		Column 0 of each line is stored into an integer array, so it must be
		convertible to int.
		"""
		data = self.featurize(self.load_data(dataFile))

		X = self._to_matrix(data)
		Y = np.zeros(len(data), dtype = int)
		for idx, (data_id, _) in enumerate(data):
			Y[idx] = data_id

		return X, Y

	# Train model and evaluate on held-out data
	def evaluate(self, trainX, trainY, devX, devY):
		"""Fit the model on the training split and print train/dev accuracy.

		The fitted model is stored on self.log_reg for predict()/printWeights().
		"""
		(D,F) = trainX.shape
		self.log_reg = linear_model.LogisticRegression(C = L2_REGULARIZATION_STRENGTH[self.feature_method.__name__])
		self.log_reg.fit(trainX, trainY)
		training_accuracy = self.log_reg.score(trainX, trainY)
		development_accuracy = self.log_reg.score(devX, devY)
		print("Method: %s, Features: %s, Train accuracy: %.3f, Dev accuracy: %.3f" % (self.feature_method.__name__, F, training_accuracy, development_accuracy))

	# Predict labels for new data
	def predict(self, testX, idsX):
		"""Write predictions for testX to "<feature method name>_predictions.csv".

		One "Id,Expected" row is written per (id, prediction) pair.
		"""
		predX = self.log_reg.predict(testX)

		path = "%s_%s" % (self.feature_method.__name__, "predictions.csv")
		# Context manager closes the file even if a write fails.
		with open(path, "w", encoding="utf8") as out:
			out.write("Id,Expected\n")
			for data_id, pred in zip(idsX, predX):
				out.write("%s,%s\n" % (data_id, int(pred)))

	# Write learned parameters to file
	def printWeights(self):
		"""Write the bias and the per-feature weights (ascending) to
		"<feature method name>_weights.txt".
		"""
		reverseVocab = [None]*len(self.feature_vocab)
		for feat, fid in self.feature_vocab.items():
			reverseVocab[fid] = feat

		path = "%s_%s" % (self.feature_method.__name__, "weights.txt")
		with open(path, "w", encoding="utf8") as out:
			# intercept_ is an array; index it rather than relying on the
			# implicit array-to-scalar conversion, which NumPy has deprecated.
			out.write("%.5f\t__BIAS__\n" % self.log_reg.intercept_[0])
			for (weight, feat) in sorted(zip(self.log_reg.coef_[0], reverseVocab)):
				out.write("%.5f\t%s\n" % (weight, feat))



In [None]:
######################################################################
##You may find it helpful to change these parameters to prevent the model from overfitting 
##and achieve higher performance
######################################################################

# Per-featurizer value passed as `C` to LogisticRegression.
# Values closer to 0 mean stronger regularization.
L2_REGULARIZATION_STRENGTH = {
    "dumb_featurize": 1,
    "fancy_featurize": 0.1,
}

# Per-featurizer vocabulary threshold: a feature is kept only if it occurs in
# at least this many training examples.
MIN_FEATURE_COUNT = {
    "dumb_featurize": 10,
    "fancy_featurize": 5,
}

In [None]:
# Implement your fancy featurization here
def fancy_featurize(text):
  """Combine several feature families into one feature dict for `text`.

  Each family's keys are prefixed with the family name ("count=", "has=",
  "4gram="). Without the prefix, binary_bag() produces exactly the same keys
  as bag_of_words(), so merging both with dict.update() silently replaced
  every count with 1 and made the bag_of_words() call a no-op.

  Returns a dict mapping "<family>=<feature>" to that feature's value.
  """
  families = (
      ("count", bag_of_words),  # unigram counts
      ("has", binary_bag),      # unigram presence
      ("4gram", fourgram),      # four-word sequences
  )

  features = {}
  for family, extractor in families:
    for key, value in extractor(text).items():
      features["%s=%s" % (family, key)] = value

  return features

In [None]:
# !wget http://mpqa.cs.pitt.edu/lexicons/subj_lexicon

In [None]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip

--2020-02-03 00:40:03--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-02-03 00:40:03--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-02-03 00:40:04--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2020-0

In [None]:
# !unzip glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [None]:
# Adds the bag of words representation of the text to feats
def bag_of_words(text):
  """Count occurrences of each lower-cased token in `text`.

  Tokens come from splitting on single spaces, so consecutive spaces yield
  an empty-string token. Returns a plain dict of {token: count}.
  """
  tokens = text.lower().split(" ")
  # Counter keeps first-seen order; convert back to a plain dict.
  return dict(Counter(tokens))

In [None]:
def binary_bag(text):
  """Return {token: 1} for every distinct lower-cased token in `text`.

  Tokens come from splitting on single spaces.
  """
  tokens = text.lower().split(" ")
  # fromkeys keeps the first occurrence of each token, in order.
  return dict.fromkeys(tokens, 1)

In [None]:
def bigram(text):
  """Count each pair of adjacent lower-cased tokens in `text`.

  Tokens come from splitting on single spaces. The two tokens of a pair are
  joined with a space: joining them with nothing made different pairs
  collide ("ab"+"c" and "a"+"bc" both became "abc") and could also collide
  with unigram features. A token never contains a space, so the joined key
  is unambiguous.

  Returns a dict of {"w1 w2": count}; empty when there are fewer than two tokens.
  """
  words = text.lower().split(" ")

  word_bag = {}
  for pair in zip(words, words[1:]):
    key = " ".join(pair)
    word_bag[key] = word_bag.get(key, 0) + 1

  return word_bag

In [None]:
def trigram(text):
  """Count each run of three adjacent lower-cased tokens in `text`.

  Tokens come from splitting on single spaces. The three tokens are joined
  with a space: joining them with nothing made different trigrams collide
  ("ab"+"c"+"d" and "a"+"bc"+"d" both became "abcd") and could also collide
  with unigram features.

  Returns a dict of {"w1 w2 w3": count}; empty when there are fewer than
  three tokens.
  """
  words = text.lower().split(" ")

  word_bag = {}
  for triple in zip(words, words[1:], words[2:]):
    key = " ".join(triple)
    word_bag[key] = word_bag.get(key, 0) + 1

  return word_bag

In [None]:
def fourgram(text):
  """Count each run of four adjacent lower-cased tokens in `text`.

  Tokens come from splitting on single spaces. The four tokens are joined
  with a space: joining them with nothing made different four-grams collide
  whenever their concatenations matched, and could also collide with
  unigram features.

  Returns a dict of {"w1 w2 w3 w4": count}; empty when there are fewer than
  four tokens.
  """
  words = text.lower().split(" ")

  word_bag = {}
  for quad in zip(words, words[1:], words[2:], words[3:]):
    key = " ".join(quad)
    word_bag[key] = word_bag.get(key, 0) + 1

  return word_bag

In [None]:
def word_freq(text, bag):
  """Turn the counts in `bag` into relative frequencies for the tokens of `text`.

  Each count whose key is a token of `text` is divided once by the number of
  tokens in `text` (lower-cased, split on single spaces). Entries of `bag`
  that are not tokens of `text` are copied unchanged.

  Fixes over the previous version:
    * the division happened inside the per-token loop, so a word occurring
      k times was divided k times;
    * the denominator was the character count, not the token count, and was
      zero for empty text;
    * `bag` was modified in place; a new dict is returned instead.
  """
  words = text.lower().split(" ")
  n = len(words)  # str.split(" ") always returns at least one item
  present = set(words)

  return {w: (count / n if w in present else count) for w, count in bag.items()}

In [None]:
def simple(text, word_bag):
  """Boost negation tokens by the length of the sentence they occur in.

  `text` is split into sentences on "."; each sentence is lower-cased and
  split on single spaces. For every token equal to "n't" or "not", the
  character length of its sentence is added to that token's entry in
  `word_bag`.

  `word_bag` is updated in place and also returned.

  Fixes over the previous version:
    * the return sat inside the sentence loop, so only the first sentence
      was ever processed;
    * a negation token missing from `word_bag` raised KeyError; it now
      starts from 0.
  """
  negations = ("n't", "not")

  for sentence in text.split("."):
    for word in sentence.lower().split(" "):
      if word in negations:
        word_bag[word] = word_bag.get(word, 0) + len(sentence)

  return word_bag

In [None]:
def rev_len(text):
  """Map every line of `text` to its length in characters.

  `text` is split on newline characters and each resulting line is used
  verbatim as the dictionary key.

  NOTE(review): because the key is the full line, these features are
  effectively unique per input — confirm that is intended before enabling
  this extractor.
  """
  return {line: len(line) for line in text.split("\n")}

In [None]:
#This code gets the train/dev/test files from github and imports them into Colab
!wget https://raw.githubusercontent.com/dbamman/nlp20/master/HW_1/train.txt
!wget https://raw.githubusercontent.com/dbamman/nlp20/master/HW_1/dev.txt
!wget https://raw.githubusercontent.com/dbamman/nlp20/master/HW_1/test.txt.zip
!unzip test.txt.zip

--2020-02-04 02:20:00--  https://raw.githubusercontent.com/dbamman/nlp20/master/HW_1/train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1427184 (1.4M) [text/plain]
Saving to: ‘train.txt’


2020-02-04 02:20:01 (8.46 MB/s) - ‘train.txt’ saved [1427184/1427184]

--2020-02-04 02:20:01--  https://raw.githubusercontent.com/dbamman/nlp20/master/HW_1/dev.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1474560 (1.4M) [text/plain]
Saving to: ‘dev.txt’


2020-02-04 02:20:01 (16.4 MB/s) - ‘dev.txt’ saved [1474560/1474560]

--2020-02-04 02:20

In [None]:
# This cell trains two models: one on the dumb features and one on your fancy
# features. For each featurizer it writes the test-set predictions to
# "<featurizer name>_predictions.csv" and the learned weights to
# "<featurizer name>_weights.txt".
# To access the files, click on the folder icon in the left sidebar.
# You can preview the files in Colab by double clicking, or download them by
# right clicking and selecting Download.
if __name__ == "__main__":
  trainingFile = "./train.txt"
  evaluationFile = "./dev.txt"
  testFile = "./test.txt"

  for feature_method in [dumb_featurize, fancy_featurize]:
    sentiment_classifier = SentimentClassifier(feature_method)
    # The training pass builds the feature vocabulary; the dev and test
    # passes below reuse it.
    trainX, trainY = sentiment_classifier.process(trainingFile, training=True)
    devX, devY = sentiment_classifier.process(evaluationFile, training=False)
    testX, idsX = sentiment_classifier.load_test(testFile)
    # evaluate() fits the model and prints train/dev accuracy; it must run
    # before printWeights() and predict(), which use the fitted model.
    sentiment_classifier.evaluate(trainX, trainY, devX, devY)
    sentiment_classifier.printWeights()
    sentiment_classifier.predict(testX, idsX)


Method: dumb_featurize, Features: 2, Train accuracy: 0.602, Dev accuracy: 0.611
Method: fancy_featurize, Features: 7364, Train accuracy: 0.997, Dev accuracy: 0.831
