In [1]:
%pip install git+https://github.com/tweepy/tweepy.git

Collecting git+https://github.com/tweepy/tweepy.git
  Cloning https://github.com/tweepy/tweepy.git to /tmp/pip-req-build-e2bdlzwf
  Running command git clone --filter=blob:none --quiet https://github.com/tweepy/tweepy.git /tmp/pip-req-build-e2bdlzwf
  Resolved https://github.com/tweepy/tweepy.git to commit c7471ffc85e9d924e9f804d045aef9c6e0e2f45c
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting oauthlib<4,>=3.2.0 (from tweepy==4.14.0)
  Downloading oauthlib-3.2.2-py3-none-any.whl (151 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m151.7/151.7 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting requests-oauthlib<2,>=1.2.0 (from tweepy==4.14.0)
  Downloading requests_oauthlib-1.3.1-py2.py3-none-any.whl (23 kB)
Building wheels for collected packages: tweepy
  Building wheel for tweepy (setup.py) ... [?25ldone
[?25h  Created wheel for tweepy: filename=tweepy-4.14.0-py3-none-any.whl size=98396 sha256=bffad670d75320d62da1eaf2bf19c35

In [None]:
# Import Packages
import os

import pandas as pd
import tweepy

In [None]:
# Connect to the Twitter API and collect a sample of tweets into a DataFrame

# Credentials are read from environment variables so that no secret is stored
# in the notebook and the cell runs on a fresh kernel. A missing required
# variable raises KeyError naming the variable.
consumer_key = os.environ["TWITTER_CONSUMER_KEY"]                # API/Consumer key
consumer_secret = os.environ["TWITTER_CONSUMER_SECRET"]          # API/Consumer secret key
access_token = os.environ["TWITTER_ACCESS_TOKEN"]                # Access token key
access_token_secret = os.environ["TWITTER_ACCESS_TOKEN_SECRET"]  # Access token secret key
bearer_token = os.environ.get("TWITTER_BEARER_TOKEN")            # optional; not used by the OAuth 1.0a handler below

# Pass in our Twitter API authentication keys
auth = tweepy.OAuth1UserHandler(
    consumer_key, consumer_secret,
    access_token, access_token_secret
)

# Instantiate the tweepy API client
api = tweepy.API(auth, wait_on_rate_limit=True)

# NOTE(review): the query has no space between the quoted terms and the first
# filter, and uses single quotes around the phrases -- confirm it matches the
# intended search before relying on the results.
search_query = "'ref''world cup'-filter:retweets AND -filter:replies AND -filter:links"
no_of_tweets = 100

try:
    # Request up to `no_of_tweets` tweets matching the search
    tweets = api.search_tweets(q=search_query, lang="en", count=no_of_tweets, tweet_mode ='extended')
except tweepy.TweepyException as e:
    # Only tweepy's own errors are reported here; KeyboardInterrupt, SystemExit
    # and programming errors still propagate.
    print('Status Failed On,',str(e))
else:
    # Pull the attributes of interest from each tweet
    attributes_container = [[tweet.user.name, tweet.created_at, tweet.favorite_count, tweet.source, tweet.full_text] for tweet in tweets]

    # Column names for the resulting DataFrame, in the same order as above
    columns = ["User", "Date Created", "Number of Likes", "Source of Tweet", "Tweet"]

    # Build the DataFrame (only when the search succeeded)
    tweets_df = pd.DataFrame(attributes_container, columns=columns)

In [None]:
#Conduct ETL/EDA on a url of tweets after turning it into a DF

import pandas as pd
import numpy as np
import string
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from nltk.stem.wordnet import WordNetLemmatizer

import nltk

In [None]:
# Fetch the WordNet corpus through NLTK's downloader; WordNetLemmatizer is imported
# above and used by the 'lemmatize' cleaning step later in the notebook
nltk.download('wordnet')

In [None]:
# 1. Read the two tab-separated tweet files published by the US Naval Academy
#    (this cell loads the first one; the next cell loads the second)

# First file: keyword-tweets.txt
url1 = 'https://www.usna.edu/Users/cs/nchamber/data/twitter/keyword-tweets.txt'
col_names = ['Sentiment', 'Tweet']

# Read straight from the URL, supplying the two column names explicitly
usn1 = pd.read_csv(
    url1,
    sep='\t',
    names=col_names,
)
usn1.head()

In [None]:
# Second file: general-tweets.txt
url2 = 'https://www.usna.edu/Users/cs/nchamber/data/twitter/general-tweets.txt'
col_names = ['Sentiment', 'Tweet']

# Read straight from the URL, supplying the two column names explicitly
usn2 = pd.read_csv(
    url2,
    sep='\t',
    names=col_names,
)
usn2.head()

**Reason: Use pandas read_csv to upload the data using the provided websites for further analysis.**

**Conclusion: Successfully uploaded data from the two provided websites and displayed a portion of data from each website.**

In [None]:
# 2. Concatenate these 2 data sets into a single data frame called LabeledTweets that has 2 columns, named Sentiment and Tweet

# Stack the two frames row-wise. ignore_index=True gives the result a fresh
# 0..n-1 index; without it each source frame keeps its own row labels, so the
# combined frame would contain duplicate index values.
LabeledTweets = pd.concat([usn1, usn2], axis=0, ignore_index=True)

# Set the column names explicitly
LabeledTweets.columns = ['Sentiment', 'Tweet']

**Reason: To concatenate the two provided data sets into a single data frame and call it LabeledTweets. Then ensure LabeledTweets has 2 columns named Sentiment and Tweet.**

**Conclusion: Successfully concatenated the two dataframes and named the columns Sentiment and Tweet.**

In [None]:
# 3. Encode the sentiment labels as integers: 'POLIT' -> 1, 'NOT' -> 0
sentiment_codes = {'POLIT': 1, 'NOT': 0}
LabeledTweets['Sentiment'] = LabeledTweets['Sentiment'].replace(sentiment_codes)

# Preview the first rows of the re-labelled frame
LabeledTweets.head()

**Reason: To replace the sentiment labels Polit with 1 and Not with 0.**

**Conclusion: Successfully created a binary column for Sentiment instead of strings Polit and Not. This will help with further analysis.**

In [None]:
# 4. Clean the tweets by doing the following:

# Removing all tokens that contain a "@". Remove the whole token, not just the character.
# Removing all tokens that contain "http". Remove the whole token, not just the characters.
# Replacing (not remove) all punctuation marks with a space (" ")
# Replacing all numbers with a space
# Replacing all non ascii characters with a space
# Converting all characters to lowercase
# Stripping the extra whitespace
# Lemmatizing tokens
# and remembering not to remove stopwords because TfidfVectorizer will do that

In [None]:
# Translation table that maps every ASCII punctuation character to a space
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def clean(text, list_of_steps):
    """Apply the requested cleaning steps to one tweet, in the order given.

    Parameters
    ----------
    text : str
        Raw tweet text.
    list_of_steps : iterable of str
        Names of the steps to run, in order. Recognised names:

        - 'remove_amp'          : drop every whitespace-separated token containing "@"
        - 'remove_http'         : drop every whitespace-separated token containing "http"
        - 'replace_punctuation' : replace each character of string.punctuation with a space
        - 'replace_numbers'     : replace each digit character with a space
        - 'replace_non_ascii'   : replace each character with code point >= 128 with a space
        - 'lower_case'          : lowercase the text
        - 'strip_whitespace'    : collapse whitespace runs to single spaces and trim the ends
        - 'lemmatize'           : lemmatize each space-separated token with WordNetLemmatizer

        Names that are not recognised are ignored.

    Returns
    -------
    str
        The processed text.
    """
    for step in list_of_steps:
        # step 1: remove entire tokens that contain an at sign, wherever it
        # appears in the token (".@user", "user@host"), not only at the start
        if step == 'remove_amp':
            text = ' '.join(tok for tok in text.split() if '@' not in tok)
        # step 2: remove entire tokens that contain "http", e.g. "(http://...)"
        elif step == 'remove_http':
            text = ' '.join(tok for tok in text.split() if 'http' not in tok)
        # step 3: replace punctuation with a space (single pass over the text)
        elif step == 'replace_punctuation':
            text = text.translate(_PUNCT_TO_SPACE)
        # step 4: replace digits with a space
        elif step == 'replace_numbers':
            text = ''.join(' ' if ch.isdigit() else ch for ch in text)
        # step 5: replace non-ASCII characters with a space
        elif step == 'replace_non_ascii':
            text = ''.join(ch if ord(ch) < 128 else ' ' for ch in text)
        # step 6: turn all text to lowercase
        elif step == 'lower_case':
            text = text.lower()
        # step 7: collapse runs of whitespace and trim both ends
        elif step == 'strip_whitespace':
            text = ' '.join(text.split())
        # step 8: lemmatize each token
        elif step == 'lemmatize':
            lmtzr = WordNetLemmatizer()
            # split on single spaces, so this expects 'strip_whitespace' to have run first
            text = ' '.join(lmtzr.lemmatize(word) for word in text.split(' '))
    # finally return the processed text
    return text

# Cleaning pipeline, listed in the order the steps are executed
step_list = [
    'remove_amp',
    'remove_http',
    'replace_punctuation',
    'replace_numbers',
    'replace_non_ascii',
    'lower_case',
    'strip_whitespace',
    'lemmatize',
]

In [None]:
# Create a test string that exercises the cleaning steps: it contains an
# @-mention, an "http" token, punctuation, digits, a non-ASCII character,
# upper-case text and a run of two spaces
test_string = "@cbigscat // can **12 http//www asoccen''t snwbrd &dggo li,on from aè  LIONSGATE" 

**Created a test string to test the steps prior to applying the function to the entire dataframe**

In [None]:
# Run the full pipeline (every step in step_list) on the test string and
# display the result for a visual check before applying it to the DataFrame
clean_text = clean(test_string, step_list)
clean_text

**Testing the steps was a success.**

In [None]:
# Run the cleaning pipeline over every tweet and store the result in a new
# 'clean_tweet' column (step_list is forwarded as clean's second argument)
LabeledTweets['clean_tweet'] = LabeledTweets['Tweet'].apply(clean, args=(step_list,))

# review dataframe
LabeledTweets

In [None]:
# Show the dtype of each column of LabeledTweets
LabeledTweets.dtypes

**Checking data types**

**Reason: To clean the tweets for legibility.**

**Conclusion: Successfully cleaned all the tweets using multiple methods, making the tweets more legible than they were originally.**

In [None]:
# 5. Use TfidfVectorizer from sklearn to prepare the data for machine learning. Use max_features = 50; 

In [None]:
# TF-IDF encode the cleaned tweets with the vocabulary capped at 50 terms
# NOTE(review): the vectorizer is fitted on every tweet, including rows that
# later land in the test split -- confirm this is acceptable for the analysis.
clean_texts = LabeledTweets['clean_tweet']
vectorizer = TfidfVectorizer(max_features=50)

# Learn the vocabulary and weights, and encode all tweets in one call
tfidf_matrix = vectorizer.fit_transform(clean_texts)

# Terms kept by the vectorizer; they become the column labels below
feature_names = vectorizer.get_feature_names_out()

# Dense copy of the matrix so it can be wrapped in a DataFrame
tfidf_matrix_dense = tfidf_matrix.toarray()

# One row per tweet, one column per retained term
tfidf_df = pd.DataFrame(data=tfidf_matrix_dense, columns=feature_names)

tfidf_df

**Reason: To use TfidfVectorizer from sklearn to prepare the data for machine learning with the usage of max_features = 50.**

**Conclusion: All rows are still present prior to fitting the model, which indicates that the data was preserved. Successfully converted the text data into a format that can be used by machine learning algorithms.**

In [None]:
# 6. Use sklearn LogisticRegression to train a model on the results on 75% of the data.

# 7. Determine the accuracy on the training data and the test data. Determine the baseline accuracy.

# 8. Repeat steps 5, 6, and 7 with TfidfVectorizer max_features set to 5, 500, 5000, 50000 and discuss your accuracies.

In [None]:
# Targets are the encoded sentiment labels; features are the TF-IDF columns
y_targets = LabeledTweets['Sentiment']

# Hold out 25% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_df,
    y_targets,
    train_size=0.75,
    random_state=42,
)

**The TF-IDF dataframe already has the features needed for analysis, so I use the sentiment values as our targets.**

In [None]:
# Sanity check: print the container type and the (rows, columns) shape of the training features
print(type(X_train), X_train.shape)

**3,003 is 75% of the original 4,004.**

In [None]:
# Build the logistic-regression model and train it on the training split
# (fit returns the estimator itself, so construction and fitting are chained)
lr = LogisticRegression(random_state=42).fit(X_train, y_train)

# Predicted labels for the training rows and for the held-out test rows
train_results, test_results = lr.predict(X_train), lr.predict(X_test)

In [None]:
# Accuracy = share of rows whose predicted label equals the true label
train_acc, test_acc = np.mean(y_train == train_results), np.mean(y_test == test_results)

# Baseline = share of the more common class among the test labels
share_of_ones = np.mean(y_test == 1)
share_of_zeros = np.mean(y_test == 0)
baseline_acc = np.max([share_of_ones, share_of_zeros])

print('Train accuracy: {}'.format(train_acc))
print('Test accuracy: {}'.format(test_acc))
print('Baseline accuracy: {}'.format(baseline_acc))

**I set the first model's max_features parameter to 50. It produces a pretty good predictive model. The test accuracy is at 76%, which suggests that it is not overfitting the data and is still giving a higher accuracy than the baseline.**

In [None]:
# Print sklearn's classification report comparing the true test labels with the model's test predictions
print(classification_report(y_test, test_results))

**With max features set at 50, predictive ability for our algorithm is 80%. Max features at 50 gives enough information for predicting sentiment.**

In [None]:
# Start the running list of results: one dict per feature size, holding the
# test accuracy rounded to three decimals
accuracy_dict = [dict(feature_size=50, accuracy=test_acc.round(3))]
accuracy_dict

**Max features set to: 5**

In [None]:
# TF-IDF encode the cleaned tweets with the vocabulary capped at 5 terms
# NOTE(review): the vectorizer is fitted on every tweet, including rows that
# later land in the test split -- confirm this is acceptable for the analysis.
clean_texts = LabeledTweets['clean_tweet']
vectorizer = TfidfVectorizer(max_features=5)

# Learn the vocabulary and weights, and encode all tweets in one call
tfidf_matrix = vectorizer.fit_transform(clean_texts)

# Terms kept by the vectorizer; they become the column labels below
feature_names = vectorizer.get_feature_names_out()

# Dense copy of the matrix so it can be wrapped in a DataFrame
tfidf_matrix_dense = tfidf_matrix.toarray()

# One row per tweet, one column per retained term
tfidf_df = pd.DataFrame(data=tfidf_matrix_dense, columns=feature_names)

tfidf_df

**This dataframe appears too simplistic to be of use for a sentiment prediction. The words presented are practically all conjunctions or prepositions.**

In [None]:
# Targets are the encoded sentiment labels; features are the TF-IDF columns
y_targets = LabeledTweets['Sentiment']

# Hold out 25% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_df,
    y_targets,
    train_size=0.75,
    random_state=42,
)

In [None]:
# Sanity check: print the container type and the (rows, columns) shape of the training features
print(type(X_train), X_train.shape)

In [None]:
# Build the logistic-regression model and train it on the training split
# (fit returns the estimator itself, so construction and fitting are chained)
lr = LogisticRegression(random_state=42).fit(X_train, y_train)

# Predicted labels for the training rows and for the held-out test rows
train_results, test_results = lr.predict(X_train), lr.predict(X_test)

In [None]:
# Accuracy = share of rows whose predicted label equals the true label
train_acc, test_acc = np.mean(y_train == train_results), np.mean(y_test == test_results)

# Baseline = share of the more common class among the test labels
share_of_ones = np.mean(y_test == 1)
share_of_zeros = np.mean(y_test == 0)
baseline_acc = np.max([share_of_ones, share_of_zeros])

print('Train accuracy: {}'.format(train_acc))
print('Test accuracy: {}'.format(test_acc))
print('Baseline accuracy: {}'.format(baseline_acc))

**We only have 5 features which creates a pretty useless model that is correct around 60% of the time.**

In [None]:
# Print sklearn's classification report comparing the true test labels with the model's test predictions
print(classification_report(y_test, test_results))

In [None]:
# Record the test accuracy for max_features=5 in the running list of results.
# Any earlier entry for this feature size is dropped first, so re-running this
# cell replaces the row instead of appending a duplicate.
accuracy_dict = [row for row in accuracy_dict if row['feature_size'] != 5]
accuracy_dict.append({'feature_size': 5, 'accuracy': test_acc.round(3)})
accuracy_dict

**Max features set to: 500**

In [None]:
# TF-IDF encode the cleaned tweets with the vocabulary capped at 500 terms
# NOTE(review): the vectorizer is fitted on every tweet, including rows that
# later land in the test split -- confirm this is acceptable for the analysis.
clean_texts = LabeledTweets['clean_tweet']
vectorizer = TfidfVectorizer(max_features=500)

# Learn the vocabulary and weights, and encode all tweets in one call
tfidf_matrix = vectorizer.fit_transform(clean_texts)

# Terms kept by the vectorizer; they become the column labels below
feature_names = vectorizer.get_feature_names_out()

# Dense copy of the matrix so it can be wrapped in a DataFrame
tfidf_matrix_dense = tfidf_matrix.toarray()

# One row per tweet, one column per retained term
tfidf_df = pd.DataFrame(data=tfidf_matrix_dense, columns=feature_names)

tfidf_df

**This data frame appears to be reasonable. The words displayed are relevant and the number of features is large enough to add value to our predictions.**

In [None]:
# Targets are the encoded sentiment labels; features are the TF-IDF columns
y_targets = LabeledTweets['Sentiment']

# Hold out 25% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_df,
    y_targets,
    train_size=0.75,
    random_state=42,
)

In [None]:
# Sanity check: print the container type and the (rows, columns) shape of the training features
print(type(X_train), X_train.shape)

In [None]:
# Build the logistic-regression model and train it on the training split
# (fit returns the estimator itself, so construction and fitting are chained)
lr = LogisticRegression(random_state=42).fit(X_train, y_train)

# Predicted labels for the training rows and for the held-out test rows
train_results, test_results = lr.predict(X_train), lr.predict(X_test)

In [None]:
# Accuracy = share of rows whose predicted label equals the true label
train_acc, test_acc = np.mean(y_train == train_results), np.mean(y_test == test_results)

# Baseline = share of the more common class among the test labels
share_of_ones = np.mean(y_test == 1)
share_of_zeros = np.mean(y_test == 0)
baseline_acc = np.max([share_of_ones, share_of_zeros])

print('Train accuracy: {}'.format(train_acc))
print('Test accuracy: {}'.format(test_acc))
print('Baseline accuracy: {}'.format(baseline_acc))

**Max features set to 500 produces a higher accuracy than max features set to 50. I believe we are approaching the overfitting realm.**

In [None]:
# Print sklearn's classification report comparing the true test labels with the model's test predictions
print(classification_report(y_test, test_results))

In [None]:
# Record the test accuracy for max_features=500 in the running list of results.
# Any earlier entry for this feature size is dropped first, so re-running this
# cell replaces the row instead of appending a duplicate.
accuracy_dict = [row for row in accuracy_dict if row['feature_size'] != 500]
accuracy_dict.append({'feature_size': 500, 'accuracy':test_acc.round(3)})
accuracy_dict

**Max features set to: 5000**

In [None]:
# TF-IDF encode the cleaned tweets with the vocabulary capped at 5000 terms
# NOTE(review): the vectorizer is fitted on every tweet, including rows that
# later land in the test split -- confirm this is acceptable for the analysis.
clean_texts = LabeledTweets['clean_tweet']
vectorizer = TfidfVectorizer(max_features=5000)

# Learn the vocabulary and weights, and encode all tweets in one call
tfidf_matrix = vectorizer.fit_transform(clean_texts)

# Terms kept by the vectorizer; they become the column labels below
feature_names = vectorizer.get_feature_names_out()

# Dense copy of the matrix so it can be wrapped in a DataFrame
tfidf_matrix_dense = tfidf_matrix.toarray()

# One row per tweet, one column per retained term
tfidf_df = pd.DataFrame(data=tfidf_matrix_dense, columns=feature_names)

tfidf_df

In [None]:
# Targets are the encoded sentiment labels; features are the TF-IDF columns
y_targets = LabeledTweets['Sentiment']

# Hold out 25% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_df,
    y_targets,
    train_size=0.75,
    random_state=42,
)

In [None]:
# Sanity check: print the container type and the (rows, columns) shape of the training features
print(type(X_train), X_train.shape)

In [None]:
# Build the logistic-regression model and train it on the training split
# (fit returns the estimator itself, so construction and fitting are chained)
lr = LogisticRegression(random_state=42).fit(X_train, y_train)

# Predicted labels for the training rows and for the held-out test rows
train_results, test_results = lr.predict(X_train), lr.predict(X_test)

In [None]:
# Accuracy = share of rows whose predicted label equals the true label
train_acc, test_acc = np.mean(y_train == train_results), np.mean(y_test == test_results)

# Baseline = share of the more common class among the test labels
share_of_ones = np.mean(y_test == 1)
share_of_zeros = np.mean(y_test == 0)
baseline_acc = np.max([share_of_ones, share_of_zeros])

print('Train accuracy: {}'.format(train_acc))
print('Test accuracy: {}'.format(test_acc))
print('Baseline accuracy: {}'.format(baseline_acc))

**With max features set to 5000, accuracy seems to drop slightly compared to max features set to 500. Both were still above 80%.**

In [None]:
# Print sklearn's classification report comparing the true test labels with the model's test predictions
print(classification_report(y_test, test_results))

In [None]:
# Record the test accuracy for max_features=5000 in the running list of results.
# Any earlier entry for this feature size is dropped first, so re-running this
# cell replaces the row instead of appending a duplicate.
accuracy_dict = [row for row in accuracy_dict if row['feature_size'] != 5000]
accuracy_dict.append({'feature_size': 5000, 'accuracy':test_acc.round(3)})
accuracy_dict

**Max features set to: 50000**

In [None]:
# TF-IDF encode the cleaned tweets with the vocabulary capped at 50000 terms
# NOTE(review): the vectorizer is fitted on every tweet, including rows that
# later land in the test split -- confirm this is acceptable for the analysis.
clean_texts = LabeledTweets['clean_tweet']
vectorizer = TfidfVectorizer(max_features=50000)

# Learn the vocabulary and weights, and encode all tweets in one call
tfidf_matrix = vectorizer.fit_transform(clean_texts)

# Terms kept by the vectorizer; they become the column labels below
feature_names = vectorizer.get_feature_names_out()

# Dense copy of the matrix so it can be wrapped in a DataFrame
tfidf_matrix_dense = tfidf_matrix.toarray()

# One row per tweet, one column per retained term
tfidf_df = pd.DataFrame(data=tfidf_matrix_dense, columns=feature_names)

tfidf_df

In [None]:
# Targets are the encoded sentiment labels; features are the TF-IDF columns
y_targets = LabeledTweets['Sentiment']

# Hold out 25% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_df,
    y_targets,
    train_size=0.75,
    random_state=42,
)

In [None]:
# Sanity check: print the container type and the (rows, columns) shape of the training features
print(type(X_train), X_train.shape)

In [None]:
# Build the logistic-regression model and train it on the training split
# (fit returns the estimator itself, so construction and fitting are chained)
lr = LogisticRegression(random_state=42).fit(X_train, y_train)

# Predicted labels for the training rows and for the held-out test rows
train_results, test_results = lr.predict(X_train), lr.predict(X_test)

In [None]:
# Accuracy = share of rows whose predicted label equals the true label
train_acc, test_acc = np.mean(y_train == train_results), np.mean(y_test == test_results)

# Baseline = share of the more common class among the test labels
share_of_ones = np.mean(y_test == 1)
share_of_zeros = np.mean(y_test == 0)
baseline_acc = np.max([share_of_ones, share_of_zeros])

print('Train accuracy: {}'.format(train_acc))
print('Test accuracy: {}'.format(test_acc))
print('Baseline accuracy: {}'.format(baseline_acc))

**Performance drops a little more with the max features set to 50000. Training accuracy is overfitting the data at this point. For our predictive model, I think we want to use lower max features. The higher number of features starts overfitting with accuracies getting above 90%.**

In [None]:
# Print sklearn's classification report comparing the true test labels with the model's test predictions
print(classification_report(y_test, test_results))

In [None]:
# Record the test accuracy for max_features=50000 in the running list of results.
# Any earlier entry for this feature size is dropped first, so re-running this
# cell replaces the row instead of appending a duplicate.
accuracy_dict = [row for row in accuracy_dict if row['feature_size'] != 50000]
accuracy_dict.append({'feature_size': 50000, 'accuracy':test_acc.round(3)})
accuracy_dict

In [None]:
# Turn the running list of results into a table: indexed by feature size,
# with its single column labelled 'TestAccuracy', ordered from the smallest
# feature size to the largest
accuracy_df = (
    pd.DataFrame(accuracy_dict)
    .set_index('feature_size')
    .set_axis(['TestAccuracy'], axis='columns')
    .sort_index(ascending=True)
)

accuracy_df

In [None]:
# Plot test accuracy (as a percentage) against the natural log of the feature size
x = np.log(accuracy_df.index.to_list())
y = accuracy_df['TestAccuracy'].to_numpy() * 100

# Create the axes explicitly, label them, then draw the line on those axes
fig, ax = plt.subplots()
ax.set(title='Test Accuracy vs Feature Size', xlabel='Log(Feature Size)', ylabel='Percentage (%)')
sns.lineplot(x=x, y=y, ax=ax)

**Test accuracy trends upward as feature size increases. Predictive test accuracy increases significantly with feature size at first. Accuracy then peaks and drops slightly as Log(Feature Size) continues to increase.**

**Reason: To use sklearn LogisticRegression to train a model on the results on 75% of the data, to determine the accuracy on the training data and the test data. Determine the baseline accuracy, and to repeat steps 5, 6, and 7 with TfidfVectorizer max_features set to 5, 500, 5000, 50000 and discuss your accuracies.**

**Conclusion: Conclusions are spread throughout the analysis. See above.** 