# Assignment `L07`
##### Daryn White

In [None]:
import re
import pandas
import numpy
import sklearn
import nltk

1. Use pandas `read_csv` with `sep='\t'` to read in the following 2 files available from the US Naval Academy: <span style="color:red; float:right">[1 point]</span>
   - url = 'https://www.usna.edu/Users/cs/nchamber/data/twitter/keyword-tweets.txt'
   - url = 'https://www.usna.edu/Users/cs/nchamber/data/twitter/general-tweets.txt'



In [None]:
# Both downloads use the same tab-separated layout and the same two column names,
# so the parser options are defined once and reused for each file.
# NOTE(review): header=0 together with names= tells pandas to treat the first line
# of each file as a header row and replace it with these names. Confirm the files
# really start with a header line; if they do not, the first tweet of each file is
# being discarded and header=None would be the correct setting.
_read_opts = dict(sep='\t', header=0, names=["sent", "tweet"])
key_tweets = pandas.read_csv(
    'https://www.usna.edu/Users/cs/nchamber/data/twitter/keyword-tweets.txt',
    **_read_opts,
)
gen_tweets = pandas.read_csv(
    'https://www.usna.edu/Users/cs/nchamber/data/twitter/general-tweets.txt',
    **_read_opts,
)

2. Concatenate these 2 data sets into a single data frame called `LabeledTweets` that has 2 columns, named `Sentiment` and `Tweet` <span style="color:red; float:right">[1 point]</span>

In [None]:
# Stack the keyword and general tweet frames into a single labelled data set.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it each
# input keeps its own row labels, so the same label would refer to two different
# tweets and any later label-based lookup or alignment becomes ambiguous.
# rename() is chained (not inplace) so this cell builds LabeledTweets in one
# expression and can be re-run safely.
LabeledTweets = (
    pandas.concat([key_tweets, gen_tweets], ignore_index=True)
    .rename(columns={'sent': "Sentiment", 'tweet': "Tweet"})
)
LabeledTweets

3. Replace sentiment labels `{'POLIT': 1, 'NOT': 0}` <span style="color:red; float:right">[0 points]</span>

In [None]:
# Recode the text labels as integers: rows labelled "POLIT" become 1 and rows
# labelled "NOT" become 0, handled one label at a time with a boolean row mask.
for label, code in (("POLIT", 1), ("NOT", 0)):
    has_label = LabeledTweets["Sentiment"] == label
    LabeledTweets.loc[has_label, "Sentiment"] = code
# Cast the recoded column so it is stored with an integer dtype.
LabeledTweets['Sentiment'] = LabeledTweets['Sentiment'].astype('int', copy=False)
LabeledTweets

4. Clean the tweets <span style="color:red; float:right">[9 points]</span>
   1. remove all tokens that contain a "@". Remove the whole token, not just the character.
   2. remove all tokens that contain "http". Remove the whole token, not just the characters.
   3. **replace** (not remove) all punctuation marks with a space (" ")
   4. **replace** all numbers with a space
   5. **replace** all non ascii characters with a space
   6. convert all characters to lowercase
   7. strip extra whitespaces
   8. lemmatize tokens
   9. No need to remove stopwords because TfidfVectorizer will take care of that


In [None]:
# Regex-driven cleaning steps: step name -> (compiled pattern, replacement text).
_REGEX_STEPS = {
    # Drop the whole whitespace-delimited token containing "@" (handles, e-mail
    # addresses, ".@user:"), not just the "@name" part of it.
    'remove_handles': (re.compile(r"\S*@\S*"), ""),
    # Drop the whole token containing "http", including any path or query string.
    'remove_links': (re.compile(r"\S*http\S*", re.IGNORECASE), ""),
    # Each run of ASCII punctuation becomes one space ("-" is escaped so it is a
    # literal hyphen rather than a character range).
    'repl_punct': (re.compile(r"[!\"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~]+"), " "),
    # Each run of digits becomes one space.
    'repl_numb': (re.compile(r"\d+"), " "),
    # Each run of non-ASCII characters becomes one space, so neighbouring words
    # are not glued together.
    'repl_nonascii': (re.compile(r"[^\x00-\x7F]+"), " "),
}


def cleaning_tweets(text, steps):
    """Apply the named cleaning steps to one tweet, in the order given.

    Args:
        text: the tweet as a string.
        steps: iterable of step names. Recognised names are 'remove_handles',
            'remove_links', 'repl_punct', 'repl_numb', 'repl_nonascii',
            'lower_all', 'strip_whitespace' and 'lemmatize'. Unrecognised names
            are skipped without error.

    Returns:
        The cleaned string.

    Notes:
        Order matters: 'remove_handles' and 'remove_links' work on
        whitespace-delimited tokens, so they should run before 'repl_punct',
        which would otherwise break those tokens apart.
        The 'lemmatize' step uses nltk's WordNetLemmatizer, which needs the
        WordNet corpus to be installed (e.g. via nltk.download('wordnet')).
    """
    for step in steps:
        if step in _REGEX_STEPS:
            pattern, replacement = _REGEX_STEPS[step]
            text = pattern.sub(replacement, text)
        elif step == 'lower_all':
            text = text.lower()
        elif step == 'strip_whitespace':
            # split() with no argument also discards leading/trailing whitespace.
            text = " ".join(text.split())
        elif step == 'lemmatize':
            # Build the lemmatizer once per call instead of once per word.
            lemmatizer = nltk.stem.WordNetLemmatizer()
            text = " ".join(lemmatizer.lemmatize(word) for word in text.split())
    return text

In [None]:
# Cleaning steps in the order they are applied: handles and links are removed
# first, while those tokens are still intact, then punctuation, digits and
# non-ASCII characters are handled, and finally case, spacing and lemmas.
steps = [
    'remove_handles',
    'remove_links',
    'repl_punct',
    'repl_numb',
    'repl_nonascii',
    'lower_all',
    'strip_whitespace',
    'lemmatize',
]
# Clean every tweet and store the result next to the original text.
LabeledTweets["Cleaned_Tweet"] = [
    cleaning_tweets(tweet, steps) for tweet in LabeledTweets["Tweet"]
]
LabeledTweets

5. Use `TfidfVectorizer` from `sklearn` to prepare the data for machine learning. Use `max_features = 50`. <span style="color:red; float:right">[2 points]</span>

In [None]:
def tfidf_vectorize(data, max_feat = 50):
    """Fit a TF-IDF vectorizer on a Series of documents and transform them.

    Args:
        data: pandas Series whose values are the text documents.
        max_feat: value passed to TfidfVectorizer as max_features.

    Returns:
        A pair (matrix, frame): the matrix returned by fit_transform, and a
        DataFrame holding the same values in dense form with one column per
        feature name reported by the vectorizer.

    Raises:
        TypeError: if data is not a pandas Series.
    """
    if isinstance(data, pandas.Series):
        tfidf = sklearn.feature_extraction.text.TfidfVectorizer(
            stop_words='english',
            max_features=max_feat,
        )
        weights = tfidf.fit_transform(data)
        feature_names = tfidf.get_feature_names_out()
        frame = pandas.DataFrame(weights.toarray(), columns=feature_names)
        return weights, frame

    raise TypeError("Data need to be in a Series format")

In [None]:
# Vectorise the cleaned tweets with the default 50-feature limit, print how many
# rows the dense frame has, then display the frame itself.
fifty_feats, fifty_feats_df = tfidf_vectorize(data=LabeledTweets["Cleaned_Tweet"])
n_rows = len(fifty_feats_df)
print(n_rows)
fifty_feats_df

6. Use `sklearn` `LogisticRegression` to train a model on 75% of the resulting data. <span style="color:red; float:right">[1 point]</span>

In [None]:
def split_data(matrix, array, test_fraction=0.25, random_state=42):
    """Split features and labels into train and test parts.

    Args:
        matrix: feature matrix with one row per sample.
        array: labels, one per sample (anything supporting len()).
        test_fraction: share of the samples held out for testing; must lie
            strictly between 0 and 1. Defaults to 0.25.
        random_state: seed forwarded to train_test_split so the same split is
            produced on every run, which keeps experiments comparable with each
            other. Pass None to get a different random split each call.

    Returns:
        The list returned by train_test_split:
        [matrix_train, matrix_test, array_train, array_test].

    Raises:
        ValueError: if test_fraction is not strictly between 0 and 1.
    """
    if not 0 < test_fraction < 1:
        raise ValueError(
            f"test_fraction must be strictly between 0 and 1, got {test_fraction!r}"
        )
    # An absolute sample count is passed (rounded from the fraction), as before.
    n_test = round(len(array) * test_fraction)
    return sklearn.model_selection.train_test_split(
        matrix, array, test_size=n_test, random_state=random_state
    )

def logistic_regression(x_train,y_train):
    """Fit a LogisticRegression with default settings and return the fitted model.

    Args:
        x_train: training feature matrix.
        y_train: training labels.

    Returns:
        The fitted sklearn LogisticRegression estimator.
    """
    classifier = sklearn.linear_model.LogisticRegression()
    fitted = classifier.fit(x_train, y_train)
    return fitted

In [None]:
# Split the 50-feature TF-IDF matrix and the integer sentiment labels into
# training and test parts using split_data.
x_trn, x_tst, y_trn, y_tst = split_data(fifty_feats,LabeledTweets['Sentiment'].values)

In [None]:
# Fit the logistic-regression model on the training part only; the test part is
# kept aside for evaluation.
lr_fifty = logistic_regression(x_trn,y_trn)

7. Determine the accuracy on the training data and the test data. Determine the baseline accuracy. <span style="color:red; float:right">[1 point]</span>

In [None]:
# Accuracy is the fraction of predictions that equal the true label. The baseline
# is the share of the more common class among the test labels.
train_hits = y_trn == lr_fifty.predict(x_trn)
test_hits = y_tst == lr_fifty.predict(x_tst)
majority_share = numpy.max([numpy.mean(y_tst == 1), numpy.mean(y_tst == 0)])
print(f"""
Training accuracy: {numpy.mean(train_hits)}
Testing accuracy: {numpy.mean(test_hits)}
Baseline accuracy: {majority_share}
""")

8. Repeat steps 5, 6, and 7 with `TfidfVectorizer` `max_features` set to 5, 500, 5000, and 50000, and discuss your accuracies. <span style="color:red; float:right">[2 points]</span>

> 5 features

In [None]:
# Same pipeline as the 50-feature run, with the vocabulary limited to 5 terms:
# vectorise, split, fit, then report train/test accuracy and the majority-class
# baseline on the test labels.
five_feats, five_feats_df = tfidf_vectorize(LabeledTweets["Cleaned_Tweet"], max_feat=5)
xtrain, xtest, ytrain, ytest = split_data(five_feats, LabeledTweets["Sentiment"].values)
lr_five = logistic_regression(xtrain, ytrain)

train_hits = ytrain == lr_five.predict(xtrain)
test_hits = ytest == lr_five.predict(xtest)
majority_share = numpy.max([numpy.mean(ytest == 1), numpy.mean(ytest == 0)])
print(f"""
Training accuracy: {numpy.mean(train_hits)}
Testing accuracy: {numpy.mean(test_hits)}
Baseline accuracy: {majority_share}
""")

> 500 features

In [None]:
# Same pipeline as the 50-feature run, with the vocabulary limited to 500 terms:
# vectorise, split, fit, then report train/test accuracy and the majority-class
# baseline on the test labels.
five_hund_feats, five_hund_feats_df = tfidf_vectorize(LabeledTweets["Cleaned_Tweet"], max_feat=500)
xtrain, xtest, ytrain, ytest = split_data(five_hund_feats, LabeledTweets["Sentiment"].values)
lr_five_hund = logistic_regression(xtrain, ytrain)

train_hits = ytrain == lr_five_hund.predict(xtrain)
test_hits = ytest == lr_five_hund.predict(xtest)
majority_share = numpy.max([numpy.mean(ytest == 1), numpy.mean(ytest == 0)])
print(f"""
Training accuracy: {numpy.mean(train_hits)}
Testing accuracy: {numpy.mean(test_hits)}
Baseline accuracy: {majority_share}
""")

> 5000 features

In [None]:
# Same pipeline as the 50-feature run, with the vocabulary limited to 5000 terms:
# vectorise, split, fit, then report train/test accuracy and the majority-class
# baseline on the test labels.
five_thou_feats, five_thou_feats_df = tfidf_vectorize(LabeledTweets["Cleaned_Tweet"], max_feat=5000)
xtrain, xtest, ytrain, ytest = split_data(five_thou_feats, LabeledTweets["Sentiment"].values)
lr_five_thou = logistic_regression(xtrain, ytrain)

train_hits = ytrain == lr_five_thou.predict(xtrain)
test_hits = ytest == lr_five_thou.predict(xtest)
majority_share = numpy.max([numpy.mean(ytest == 1), numpy.mean(ytest == 0)])
print(f"""
Training accuracy: {numpy.mean(train_hits)}
Testing accuracy: {numpy.mean(test_hits)}
Baseline accuracy: {majority_share}
""")

> 50,000 features

In [None]:
# Same pipeline as the 50-feature run, with the vocabulary limited to 50000 terms:
# vectorise, split, fit, then report train/test accuracy and the majority-class
# baseline on the test labels.
fifty_thou_feats, fifty_thou_feats_df = tfidf_vectorize(LabeledTweets["Cleaned_Tweet"], max_feat=50000)
xtrain, xtest, ytrain, ytest = split_data(fifty_thou_feats, LabeledTweets["Sentiment"].values)
lr_fifty_thou = logistic_regression(xtrain, ytrain)

train_hits = ytrain == lr_fifty_thou.predict(xtrain)
test_hits = ytest == lr_fifty_thou.predict(xtest)
majority_share = numpy.max([numpy.mean(ytest == 1), numpy.mean(ytest == 0)])
print(f"""
Training accuracy: {numpy.mean(train_hits)}
Testing accuracy: {numpy.mean(test_hits)}
Baseline accuracy: {majority_share}
""")

#### Discussion

> Based on the tests above, accuracy appears to peak at 500 features; each further order-of-magnitude increase in the number of features only decreases it.
>
> I believe this is a simple example of over-fitting a model. We might find slightly higher accuracy by moving up or down around 500 features in smaller increments, which would be an example of tuning (optimizing) the number of features. Of course, this is a relatively small dataset to work with, but it illustrates that over-fitting is easy to do if one doesn't pay close attention.