In [4]:
%matplotlib inline

from glob import glob
import json
import matplotlib.pyplot as plt
import numpy as np
import nltk
from nltk.corpus import stopwords
import os.path as path
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.feature_extraction.text import CountVectorizer as count_vectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB as multinomial_nb
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.datasets import load_digits
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
import string
import time
import operator
from sklearn.linear_model import LogisticRegression
from wordcloud import WordCloud
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dropout, Flatten, Dense
from keras.layers import Activation, Conv2D, MaxPooling2D, AveragePooling2D
from keras import backend as K
from matplotlib import colors as mcolors

In [5]:
def convert(x):
    # load json object
    ob = json.loads(x)
    
    # parse through objects in json and join as a csv or dict
    for k, v in ob.items():
        if isinstance(v, list):
            ob[k] = ','.join(v)
        elif isinstance(v, dict):
            for kk, vv in v.items():
                ob['%s_%s' % (k, kk)] = vv
            del ob[k]
    return ob

In [21]:
def text_process(text, weak_sentiment_word_list):
    #Instantiate word_list array that will contain a final list of the words in the input
    #text with the punctuation removed and all characters in lowercase.
    word_list = []
    
    #Parse characters of text and remove punctuation
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    
    #Parse word by word in text convert to lowercase and remove stopwords and weak sentiment words
    for word in nopunc.split():
        word = word.lower()
        if word not in stopwords.words('english'):
            if word not in weak_sentiment_word_list:
                word_list.append(word.lower())
        
    return word_list

In [22]:
def normalize_dataset(tweet):
    # instantiate yelp_normalized array
    tweet_normalized = []
    
    # Create datasets separated by a normalized distribution of the number of likes
    tweet_1 = tweet[(tweet['likes'] == 1)]
    tweet_2 = tweet[(tweet['likes'] == 2)]
    tweet_3 = tweet[(tweet['likes'] == 3)]
    tweet_4 = tweet[(tweet['likes'] == 4)]
    tweet_5 = tweet[(tweet['likes'] == 5)]
    
    # determine the lowest count in datasets
    limiting_factor = min([len(tweet_1), len(tweet_2), len(tweet_3), len(tweet_4), len(tweet_5)])
        
    # concatenate all datasets into one dataset
    tweet_normalized.append(tweet_1.sample(limiting_factor))
    tweet_normalized.append(tweet_2.sample(limiting_factor))
    tweet_normalized.append(tweet_3.sample(limiting_factor))
    tweet_normalized.append(tweet_4.sample(limiting_factor))
    tweet_normalized.append(tweet_5.sample(limiting_factor))
    
    return pd.concat(tweet_normalized)

In [23]:
def generate_string(tweet, rating):
    # create dataset unique to a star rating and instantiate a string array
    tweet = tweet[(tweet['likes'] == rating)]
    string = []
    
    # parse tokenized text in each review
    for text in tweet['tokenized']:
        # parse tokens in tokenized text and append them to string array
        for token in text:
            string.append(token)
    return pd.Series(string).str.cat(sep=' ')

In [24]:
def generate_list(string_count, length):
    # instantiate word_list and set count to 0
    word_list = []
    count = 0
    
    # parse words in string_count 
    for word in string_count:
        # append the word while count is less than length
        if count < length:
            count += 1
            word_list.append(word[0])
        else:
            break
    
    return word_list

In [25]:
def generate_weak_sentiment_list(tweet):
    # instantiate weak_sentiment_word_list array
    weak_sentiment_word_list = []

    # generate a positive and negative string based on 1 and 5 stars
    tweet_negative_string = generate_string(tweet, 1)
    tweet_positive_string = generate_string(tweet, 5)

    # sort items in positive and negative arrays from greatest to least
    positive_string_count = sorted(word_count(tweet_positive_string).items(), 
                                   key=operator.itemgetter(1), 
                                   reverse = True)
    
    negative_string_count = sorted(word_count(tweet_negative_string).items(), 
                                   key=operator.itemgetter(1), 
                                   reverse = True)
    
    # arbitrarily set a length based on the length of both arrays
    length = int((len(positive_string_count) + len(negative_string_count)) * 0.001 / 2)
    
    # generate positive and negative word lists
    positive_word_list = generate_list(positive_string_count, length)
    negative_word_list = generate_list(negative_string_count, length)
    
    # parse words in the lists and if they match add them to the weak sentiment array
    for word in positive_word_list:
        if word in negative_word_list:
            weak_sentiment_word_list.append(word)
    return weak_sentiment_word_list

In [26]:
def convert_review_large_csv(override):
    # load json into memory if it does not exist or there is an override to overwrite the file
    if not path.exists('./dataset/review_large.csv') or override:
        for json_filename in glob('*.json'):
            # create csv of the same name as json
            csv_filename = '%s.csv' % json_filename[:-5]
            print('Converting %s to %s' % (json_filename, csv_filename))
            
            # parse lines of json in memory and add them to a dataframe and convert dataframe into a csv
            df = pd.DataFrame([convert(line) for line in open(json_filename)])
            df.to_csv(csv_filename, encoding='utf-8', index=False)
    else:
        print('review.csv already exists and no override detected')

In [64]:
def clean_dataset(tweet):
    #Instantiate a weak_sentiment_list array
    weak_sentiment_list = []
    
    #print(type(tweet))
    
    #Retype text to string 
    tweet['text'] = tweet['text'].astype(str)
    
    #Create length and tokenized columns 
    tweet['length'] = tweet['text'].apply(len)
    tweet['tokenized'] = tweet.apply(lambda row: text_process(row['text'], weak_sentiment_list), axis=1)
    
    #Generate a weak sentiment word list and apply it to the tokenized column
    weak_sentiment_list = generate_weak_sentiment_list(tweet)
    tweet['tokenized'] = tweet.apply(lambda row: text_process(row['text'], weak_sentiment_list), axis=1)
    
    return tweet, weak_sentiment_list

In [53]:
def create_class(tweet, boundary):
    # generate class of yelp based on stars included
    if not boundary:
        tweet_class = tweet
    else:
        tweet_class = tweet[(tweet['likes'] == 1) | (tweet['likes'] == 5)]
    
    return tweet_class

In [54]:
def generate_X_y(tweet_class):
    # instantiate X_list array
    X_list = []
    
    # create X and y and assign appropriate columns
    X = tweet_class['tokenized']
    y = tweet_class['likes']
    
    # parse items in column and append them to X_list array
    for item in X:
        X = ' '.join(item)
        X_list.append(X)
    
    return X_list, y

In [55]:
def bow_transformer(X):
    # vectorize words in X with an ngram of 1 and setting a feature ceiling at 450,000 
    # the feature ceiling was set to extend the boundary case without conflicts
    bow_transformer = count_vectorizer(ngram_range=(1, 2), max_features=450000).fit(X)
    
    # transform vectorize words
    X = bow_transformer.transform(X)
    
    return X

In [56]:
def word_count(str):
    #Instantiate a counts dictionary and split the words in str
    counts = dict()
    words = str.split()

    #Parse words in text and count the number of times that the word appears in the dictionary
    for word in words:
        if word in counts:
            counts[word] += 1
        else:
            counts[word] = 1

    return counts

In [57]:
def classifier_train(classifier, X_train, y_train, X_test, y_test):
    #Set time and fit classifier
    t0 = time.time()
    classifier.fit(X_train, y_train)
    
    #Set time and use the classifier to predict the test values  
    t1 = time.time()
    prediction = classifier.predict(X_test)
    
    #Set completion time
    t2 = time.time()
    
    #Determine training and prediction times
    time_train = t1-t0
    time_predict = t2-t1
    
    #Evaluate accuracy of the classifier based on the test case
    score = classifier.score(X_test, y_test)
    print('Score: {0}'.format(score))
    print('\n')
    
    # evaluate the confusion matrix based on the predictions generated by the classifier
    confusion_matrix = metrics.confusion_matrix(y_test, prediction)
    print('Confusion Matrix: \n {0}'.format(confusion_matrix))
    print('\n')
    
    print('Training time: {0:.3f}s; Prediction time: {1:.3f}s'.format(time_train, time_predict))
    print(classification_report(y_test, prediction))
    
    return classifier, prediction, time_train, time_predict, score, confusion_matrix

In [58]:
#Loading the dataset of tweets
tweets = pd.read_csv('./datasets/data_elonmusk.csv', encoding='unicode_escape')

In [59]:
# Loading dataset of TSLA stock prices
stocks = pd.read_csv('./datasets/TSLA.csv')

In [60]:
#sns.countplot(x='Time',data=tweets);

In [63]:
clean_dataset(tweets)

<class 'pandas.core.frame.DataFrame'>


KeyError: 'text'

In [47]:
retweets = tweets.groupby('Retweet from').mean()

  exec(code_obj, self.user_global_ns, self.user_ns)
