In [983]:
from TwitterAPI import TwitterAPI, TwitterOAuth, TwitterRequestError, TwitterConnectionError, TwitterPager
import pandas as pd
import os
import random
import matplotlib.pyplot as plt
%matplotlib inline  
import tensorflow as tf
import tensorflow_hub as hub
from textblob import TextBlob
import seaborn as sns
import plotly.express as px
import re
import time

In [984]:
#Keys
auth = TwitterOAuth.read_file()
api = TwitterAPI(auth.consumer_key, auth.consumer_secret, auth.access_token_key, auth.access_token_secret, api_version='2')

In [985]:
#function to collect tweets
def collect_tweets(data):
    #create infinity loop
    while 1:  
        #make API call
        response = TwitterPager(api, 'tweets/search/recent', {
        #excludes retweets and replies in search query and tweets with hashtags and only includes English tweets
        'query':'jk rowling trans lang:en -is:retweet -is:reply',
        #collects tweet data such as when the tweet was created and the contents of the tweet
        'tweet.fields':'created_at,text',
        #collects the type of tweet it is, (eg. quoted)
        'expansions':'referenced_tweets.id',
        #collects tweets posted until 6/1 3 p.m. UTC (6/2 12 a.m. KST)
        'end_time':'2022-06-05T15:00:00Z',
        #maximum results that can be collected in a page is 100
        'max_results':100})
        time.sleep(1)
        
        #collect tweets
        for items in response.get_iterator(new_tweets=False):
            #collects type of tweet. if there is no referenced_tweets dictionary, it means that it is an original tweet
            if 'referenced_tweets' in items:
                tweet_type = items['referenced_tweets'][0]['type']
            else:
                tweet_type = "original_tweet"
        
            
            #collect tweet ID, time posted, and the content of the tweet
            tweet_id = items['id']
            time_created = items['created_at']
            time_created = str(time_created).split("T")[0]
            tweet_text = items['text']
            tweet_text = re.sub('@[^\s]+','',tweet_text)
            tweet_text = re.sub(r'http\S+','',tweet_text)
            tweet_text = re.sub('&amp;','',tweet_text)
            
            #append tweet details to dataframe
            tweet_data = {'tweet id': tweet_id, 'created at': time_created, 
                              'tweet type': tweet_type, 'contents':tweet_text}
            data = pd.concat([data, pd.DataFrame([tweet_data.values()], columns=data.columns)], ignore_index=True)
            data.to_csv(r'C:\Users\dania\Documents\경희대학교\웹 파이선프로그래밍\Term Project\term-project.csv')

        return data

In [986]:
#building the pandas dataframe
df = pd.DataFrame(columns=["tweet id", "created at", "tweet type", "contents"])
df = collect_tweets(df)

In [987]:
df.head()

Unnamed: 0,tweet id,created at,tweet type,contents
0,1533444025042079744,2022-06-05,original_tweet,BBC says it's 'misleading' to call JK Rowling'...
1,1533437278260969473,2022-06-05,quoted,I have 3 children and the books movies of JK ...
2,1533423561339047936,2022-06-05,original_tweet,In one trillion years there will be no more JK...
3,1533390036183715843,2022-06-05,original_tweet,"This is what supports, people. The eradicatio..."
4,1533385663612977152,2022-06-05,original_tweet,"Happy #pride babes ❤️🧡💛💚💙💜\n\nStay hydrated, k..."


In [988]:
def sentiment_analysis(tweet):
    def getSubjectivity(text):
        return TextBlob(text).sentiment.subjectivity

 #Create a function to get the polarity
    def getPolarity(text):
        return TextBlob(text).sentiment.polarity
    df['subjectivity'] = df['contents'].apply(getSubjectivity)
    df['polarity'] = df['contents'].apply(getPolarity)

    def getAnalysis(score):
          if score < 0:
            return 'Negative'
          else:
            return 'Positive'

    df['analysis'] = df['polarity'].apply(getAnalysis)
    df.to_csv(r'C:\Users\dania\Documents\경희대학교\웹 파이선프로그래밍\Term Project\term-project.csv')
    return tweet

In [989]:
sentiment_analysis(df)

Unnamed: 0,tweet id,created at,tweet type,contents,subjectivity,polarity,analysis
0,1533444025042079744,2022-06-05,original_tweet,BBC says it's 'misleading' to call JK Rowling'...,0.000000,0.000000,Positive
1,1533437278260969473,2022-06-05,quoted,I have 3 children and the books movies of JK ...,0.100000,0.000000,Positive
2,1533423561339047936,2022-06-05,original_tweet,In one trillion years there will be no more JK...,0.500000,-0.250000,Negative
3,1533390036183715843,2022-06-05,original_tweet,"This is what supports, people. The eradicatio...",0.000000,0.000000,Positive
4,1533385663612977152,2022-06-05,original_tweet,"Happy #pride babes ❤️🧡💛💚💙💜\n\nStay hydrated, k...",0.640000,0.120000,Positive
...,...,...,...,...,...,...,...
205,1531303224501755910,2022-05-30,original_tweet,if you google the prison that's in that articl...,0.494444,0.200000,Positive
206,1531298457239052290,2022-05-30,original_tweet,jk rowling concern trolling re: trans women in...,0.527778,0.080556,Positive
207,1531295538171195392,2022-05-30,quoted,Or a black trans artist! (Do you know one?) I ...,0.416667,-0.054167,Negative
208,1531293269740732418,2022-05-30,original_tweet,"Jk rowling calling being trans a ""luxury belie...",0.503333,0.257121,Positive


In [990]:
#reads csv file containing dataframe
labeled_tweets = pd.read_csv(r'C:\Users\dania\Documents\경희대학교\웹 파이선프로그래밍\Term Project\term-project.csv')
#includes only the relevant columns
labeled_tweets = labeled_tweets[['contents', 'polarity']]
labeled_tweets.head()

Unnamed: 0,contents,polarity
0,BBC says it's 'misleading' to call JK Rowling'...,0.0
1,I have 3 children and the books movies of JK ...,0.0
2,In one trillion years there will be no more JK...,-0.25
3,"This is what supports, people. The eradicatio...",0.0
4,"Happy #pride babes ❤️🧡💛💚💙💜\n\nStay hydrated, k...",0.12


In [991]:
#creates train dataframe with 80% of the data, and a test set with the remaining 20%
train = labeled_tweets.sample(frac=0.8)
test = labeled_tweets.drop(train.index)

#turns columns into numpy arrays to use as inputs for training
train_examples, train_labels = train['contents'].values, train['polarity'].values
test_examples, test_labels = test['contents'].values, test['polarity'].values

In [992]:
#sets up a dictionary for tensorflow hub downloads to go to
os.environ['TFHUB_CACHE_DIR'] = r'C:\Users\dania\Documents\경희대학교\웹 파이선프로그래밍\Term Project\Tensorflow\modules'

#gets tokenization model from tensorflow hub to convert tweet content into numerical values
model = "https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2"
hub_layer = hub.KerasLayer(model, input_shape=[], dtype=tf.string, trainable=True)

#tokenization of an abstract to binary
hub_layer(train_examples[0:1])

<tf.Tensor: shape=(1, 128), dtype=float32, numpy=
array([[ 4.17549431e-01, -7.03618536e-03,  2.14448139e-01,
        -7.10551068e-02, -3.40042599e-02, -9.79365595e-03,
         3.27943563e-02,  4.19967733e-02, -1.51335537e-01,
         8.41522068e-02, -1.60442237e-02, -3.11894000e-01,
         5.28027397e-03,  2.17520706e-02, -6.67694956e-02,
         9.97089297e-02,  2.25942090e-01, -1.11131310e-01,
        -1.44412994e-01,  3.82436424e-01,  7.84914047e-02,
        -7.52940997e-02,  5.58948740e-02,  7.78195634e-02,
        -1.72048599e-01, -1.44294456e-01,  4.27274480e-02,
         6.96867704e-02, -3.04792792e-01,  5.77635653e-02,
        -3.43274092e-03,  9.62619260e-02,  2.17905939e-02,
         1.02503225e-01,  1.20593682e-02,  7.59878010e-02,
        -3.93416509e-02, -5.93775734e-02,  7.16062784e-02,
         2.48345375e-01, -4.08394150e-02,  3.23396511e-02,
        -1.23106316e-02, -1.45973071e-01,  1.75744802e-01,
         1.96779773e-01, -4.01058383e-02, -8.05485025e-02,
      

In [993]:
#creates a Sequential neural network
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(8, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation = 'sigmoid'))
model.summary()

Model: "sequential_43"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer_41 (KerasLayer)  (None, 128)              124642688 
                                                                 
 dense_126 (Dense)           (None, 16)                2064      
                                                                 
 dense_127 (Dense)           (None, 8)                 136       
                                                                 
 dense_128 (Dense)           (None, 1)                 9         
                                                                 
Total params: 124,644,897
Trainable params: 124,644,897
Non-trainable params: 0
_________________________________________________________________


In [994]:
# compiles model
# designed for binary classification
model.compile(optimizer='adam',
              loss=tf.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tf.metrics.BinaryAccuracy(threshold=0.5, name='accuracy')])

In [995]:
# sets aside some of the example datapoints for validation during training
x_val = train_examples[:70]
partial_x_train = train_examples[70:]

y_val = train_labels[:70]
partial_y_train = train_labels[70:]

print(len(x_val), len(partial_x_train))

70 98


In [996]:
#train the model
history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=10,
                    batch_size=140,
                    validation_data=(x_val, y_val),
                    verbose=1)

Epoch 1/10


  return dispatch_target(*args, **kwargs)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [997]:
# evaluate performance on test set
results = model.evaluate(test_examples, test_labels)
print(results)

[0.6526638269424438, 0.4047619104385376]


In [998]:
# visualize performance with a confusion matrix
predictions = (model.predict(test_examples) > 0.5).astype("int32")
predictions = [p[0] for p in predictions]
confusion_matrix = tf.math.confusion_matrix(test_labels, predictions)

ax = plt.axes()
sns.heatmap(confusion_matrix, cmap='flare', annot = True, fmt = 'd', ax = ax)
ax.set_title('Tensorflow NLP Model: Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()



InvalidArgumentError: `labels` contains negative values.  
Condition x >= 0 did not hold element-wise:
x (shape=(42,) dtype=int64) = 
['0', '0', '0', '...']

In [None]:
# gets TextBlob analysis
def get_sentiment_2(text):
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return 1
    elif analysis.sentiment.polarity == 0 and random.random() > 0.5:
        return 1
    else: 
        return 0

In [None]:
# compares TextBlob with TensorFlow
correct = 0
incorrect = 0

for i in range(len(test_examples)):
    if test_labels[i] == get_sentiment_2(test_examples[i]):
        correct +=1
    else:
        incorrect +=1
        
print('Accuracy: ', correct/(correct+incorrect))

In [None]:
# gets TensorFlow analysis results
def tensor_sentiment_calc(text):
    tensor_flow = (model.predict(['contents'])> 0.5).astype("int32")[0][0]
    if tensor_flow==1:
        return 'positive'
    else:
        return 'negative'
        
df['textblob'] = df['contents'].apply(sentiment_calc)
df['tensor'] = df['contents'].apply(tensor_sentiment_calc)

In [None]:
df.to_csv(r'C:\Users\dania\Documents\경희대학교\웹 파이선프로그래밍\Term Project\term-project-final.csv')

In [None]:
df.head()