# Logistic Regression with uni, bi, and tri gram embeddings

## imports

In [55]:
import string
from os import listdir
from os.path import isfile, join
import nltk
from nltk import word_tokenize
from nltk.corpus import opinion_lexicon
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_precision_recall_curve
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer


# Libraries

import matplotlib.pyplot as plt

import pandas as pd
# Preliminaries
from sklearn.model_selection import train_test_split

# Models

# Training

# Evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

nltk.download('opinion_lexicon')
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package opinion_lexicon to /root/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### configs

### setup puncuation and contractions to help clean text

In [56]:
punctuation = string.punctuation

contractions = {
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

### functions to help get data from txt files and to process and clean data

In [4]:
def get_file_key(fileName):
    # get the file num
    return int(fileName.split("_")[1].split(".")[0]) 
    
def get_files_from_dir(directory):
    filesInDir = [f for f in listdir(directory) if isfile(join(directory, f))]
    return sorted(filesInDir, key = get_file_key)   
        
def process_string_sentence(text):
        englishStopwords = stopwords.words("english")  # non-neccesary words
        text = text.lower()  # case folding
        # remove punctuation
        text = "".join([char for char in text if char not in punctuation])
        words = word_tokenize(text)
        removed = [word for word in words if word not in englishStopwords]
        stemmed = [PorterStemmer().stem(word) for word in removed]
        stemmed_sentence = " ".join(stemmed)
        return stemmed_sentence

def process_string(text):
        englishStopwords = stopwords.words("english")  # non-neccesary words
        text = text.lower()  # case folding
        # remove punctuation
        text = "".join([char for char in text if char not in punctuation])
        words = word_tokenize(text)
        removed = [word for word in words if word not in englishStopwords]
        return [", ".join(removed)]


def tokenize_files(files, dir):
        cleaned_positive_files = []
        for file in files:
            file_path = str.format("{}/{}", dir, file)
            with open(file_path) as f:
                raw_text = f.read()
                cleaned_positive_files.append(process_string(raw_text))
        return cleaned_positive_files

def is_word_positive(word):
        if word in positive_dict or word in positive_dict_stemmed:
            return True
        return False

def is_word_negative(word):
    if word in negative_dict or word in negative_dict_stemmed:
        return True
    return False



def get_word_occurrences(tokenized_files):
        word_occurrences = {}
        word_occurrences["positive"] = 0
        word_occurrences["negative"] = 0
        total_num_words = 0
        for file in tokenized_files:
            # calc number exclams
            # calc number pos/neg/words
            for word in file:
                if is_word_positive(word):
                    word_occurrences["positive"] += 1
                if is_word_negative(word):
                    word_occurrences["negative"] += 1
                if word not in word_occurrences:
                    word_occurrences[word] = 0
                word_occurrences[word] += 1
                total_num_words += 1
        return word_occurrences, total_num_words

def get_raw_text_from_files(files: list, dir: str) -> list:
    raw_text = []
    for file in files:
        file_path = str.format("{}/{}", dir, file)
        with open(file_path) as f:
            file_text_in_lines = f.read()
            raw_text.append(file_text_in_lines)
    return raw_text

def clean_text(text, remove_stopwords = True):
    '''Remove unwanted characters, stopwords, and format the text to create fewer nulls word embeddings'''

    # Convert words to lower case
    text = text.lower()

    # Replace contractions with their longer forms
    if True:
        text = text.split()
        new_text = []
        for word in text:
            if word in contractions:
                new_text.append(contractions[word])
            else:
                new_text.append(word)
        text = " ".join(new_text)

    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # remove stop words
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)

    # Tokenize each word
    text =  nltk.WordPunctTokenizer().tokenize(text)

    return text


### import and process the data

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [57]:
# Import the csv into pandas dataframe and add the headers
df = pd.read_csv('/content/drive/MyDrive/stack_nlp_large.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,PostId,PostCreationDate,OwnerUserId,OwnerCreationDate,ReputationAtPostCreation,OwnerUndeletedAnswerCountAtPostTime,Title,BodyMarkdown,Tag1,Tag2,Tag3,Tag4,Tag5,PostClosedDate,OpenStatus,titletext,closed_reason_label
0,7,23,08/01/2008 12:09:41,48,08/01/2008 13:25:15,1,0,Latest information on PHP upcoming releases,I'm trying to track the progress of PHP 5.3 an...,php,,,,,05/18/2012 11:12:42,not constructive,Latest information on PHP upcoming releases. I...,4
1,30,126,08/01/2008 16:10:30,58,08/01/2008 13:56:33,11,1,How would you access Object properties from wi...,"What is the ""purist"" or ""correct"" way to acces...",oo,java,php,theory,,05/08/2012 18:11:27,not constructive,How would you access Object properties from wi...,4
2,31,129,08/01/2008 16:22:42,48,08/01/2008 13:25:15,11,1,How to export data from SQL Server to MySQL,I've been banging my head against SQL Server 2...,csv,ansi,sql,php,mssql,07/03/2012 14:30:16,off topic,How to export data from SQL Server to MySQL. I...,3
3,37,173,08/01/2008 18:33:08,83,08/01/2008 16:31:56,16,4,How do I version my MS SQL database in SVN?,I've been wanting to get my databases under ve...,subversion,svn,sql,mssql,versioncontrol,06/29/2012 15:08:28,not constructive,How do I version my MS SQL database in SVN?. I...,4
4,41,177,08/01/2008 18:37:55,83,08/01/2008 16:31:56,16,4,How do I programmatically create a PDF in my ....,Please recommend a good library for programmat...,pdf,.net,,,,04/25/2012 11:32:29,not constructive,How do I programmatically create a PDF in my ....,4


In [58]:
# # Converting the codes to appropriate categories using a dictionary
def convertReasonToLabel(reason):
    mapper = {"not a real question": 0, "too localized": 1, "off topic": 2, 'not constructive': 3, "open": 4 }
    return mapper.get(reason)

df['category'] = df['OpenStatus'].apply(convertReasonToLabel)

# remove all open 
df = df[df.category != 4]

# df.head()

In [59]:
dataSample = df.sample(n = 5000)

In [60]:

dataSample['titletext'] = dataSample['Title'] + ". " + dataSample['BodyMarkdown']

### gather and clean data 

In [61]:

dataSample['Titles_Cleaned'] = list(map(clean_text, dataSample["Title"]))
dataSample['Bodies_Cleaned'] = list(map(clean_text, dataSample["BodyMarkdown"]))
dataSample['Title_Text_Cleaned'] = list(map(clean_text, dataSample["titletext"]))


### function to run logistic regression model with the given bow converter and data

In [62]:
import sklearn

def run_logisitc(allData, allLabels, description,  bow_converter, _C=1.0,):
    kf = KFold(n_splits=2, shuffle=True)
    model = None
    for trainingIndex, testingIndex in kf.split(allData):
        # get the train/test labels and data from split
        trainingData, testingData = allData[trainingIndex], allData[testingIndex]
        trainingLabels, testingLabels = allLabels[trainingIndex], allLabels[testingIndex]

        # convert the data to bow
        trainingData = bow_converter.fit_transform(trainingData)
        testingData = bow_converter.transform(testingData)

        # create the model
        model = LogisticRegression(multi_class='ovr', solver='liblinear').fit(trainingData, trainingLabels)

        # get the model score
        score = model.score(testingData, testingLabels)
        print(description, " model score: ", score)

        # get label prediction
        labelPrediction = model.predict(testingData)
        # print('Predicted value is =', lm.predict([X_test[200]]))
        print("f1 score (micro): ", sklearn.metrics.f1_score(testingLabels, labelPrediction, average='micro'))
        print("f1 score (macro): ", sklearn.metrics.f1_score(testingLabels, labelPrediction, average='macro'))

    return model



In [63]:
# turn into numpy array 
allLabels = np.array(dataSample['category'])
allLabels.size

5000

In [64]:
# turn into numpy array 
titlesAndText = np.array(dataSample['Title_Text_Cleaned'])
titlesAndText.size

5000

In [71]:
bigram_bow_converter = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False) 
model = run_logisitc(titlesAndText, allLabels, "bigram BOW", bigram_bow_converter)



bigram BOW  model score:  0.508
f1 score (micro):  0.508
f1 score (macro):  0.3866771510754241
bigram BOW  model score:  0.5036
f1 score (micro):  0.5036
f1 score (macro):  0.3778661438516365


