In [1]:

import numpy as np 
import pandas as pd 

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



In [2]:
## First we will read the .csv file of tweets into a pandas dataframe. 
## Note that this 

tweets = pd.read_csv('covid19_tweets.csv')


In [3]:

## If you comment out the commands below, you can see the format of the dataframe. 

## The first column is an index, then you have user_name, user_location, user_description,
## user_created, user_followers, user_friends, user_favourites, user_verified (True/False),
## date, text, hashtags (presented as a list of strings), 
## source (Twitter Web App, for android, etc), and is_retweet (True/False)



##print(len(tweets))
##print(tweets.head(30))


In [4]:
## Here is an example of the sort of simple string commands we will use to 
## determine likely political leaning. Uncomment the line below to see if the 
## author of the first tweet in the dataframe has the word conservative in their 
## user description. 

##'conservative' in tweets.loc[0]['user_description'].lower()

In [5]:
## Our first goal is to extract a subset of the tweet texts, labeled according to political
## leaning. It's hard to extract someone's political stance just from the information
## available in this table, but we will make a guess that a user_description that
## uses a lot of right-wing adjectives (conservative, traditional, etc.) is probably
## conservative and a user_description that uses a lot of left-wing adjectives (liberal,
## progressive, etc.) is probably liberal. Of course, many people leave their descriptions blank
## or don't include such terms; we will simply omit their posts. 

## The first thing we do is define a function that will extract the political leaning
## of a tweet's author. We build it out of somewhat simpler functions: 
## first a function that checks if a word is in a user_description, then a function
## that checks if any of a list of words appears in a user_description,
## and finally a function that checks to see if 

def word_in_desc(word,desc):
    return word in str(desc)

def some_word_in_desc(lst,desc):
    state = False
    for word in lst:
        if word_in_desc(word,desc):
            state = True
            break
    return state

def political_leaning(desc, 
                      liberal_words=['feminist','ImWithHer','liberal','progressive','democratic','democrat','leftist'],
                      conservative_words=['MAGA','patriot','conservative','traditional','libertarian','republican']):
    if some_word_in_desc(liberal_words,desc) and not some_word_in_desc(conservative_words,desc):
        return "L"
    elif some_word_in_desc(conservative_words,desc) and not some_word_in_desc(liberal_words,desc):
        return "R"
    else:
        return "I"

        


In [6]:
## We should check that this works at least somewhat reasonably. 

tweets['political_leaning']=tweets.user_description.apply(lambda x: political_leaning(x))

print(len(tweets))
liberal_tweets = tweets[tweets.political_leaning == 'L']
print(len(liberal_tweets))
conservative_tweets = tweets[tweets.political_leaning == 'R']
print(len(conservative_tweets))
partisan_tweets = tweets[tweets.political_leaning != 'I']
print(len(partisan_tweets))
## Somewhat unfortunately we have a pretty small set of tweets to work with after 
## filtering down to the partisan tweets. Also the conservative tweets outnumber the 
## liberal tweets. However it seems like we should be able to find _some_ interesting
## patterns even with just a couple thousand tweets. 

166656
1080
1499
2579


**We want to decide if political leaning has an effect on sentiment. We need to build a 
classifier. We will use the NLTK module, which is a popular tool for analyzing 
"natural language" documents such as tweets. ** 

In [7]:
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

from nltk.tag import pos_tag
from nltk.corpus import twitter_samples

from nltk.tokenize import TweetTokenizer



## First we are going to use some standard training tweets to create a model. 
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')

tknz = TweetTokenizer()



In [8]:
## tweet_text_tokenizes = [tknz.tokenize(tweet) for tweet in tweet_texts]
## print(tweet_text_tokenizes[10])

In [9]:
## We need to process the words. The first step is Lemmatization: reducing a word to its root or canonical form.



from nltk.stem.wordnet import WordNetLemmatizer


## If you are missing the two modules, uncomment the code. 
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

# The following function uses the pos_tag information to feed the lemmatizer (which can only operate when it 
## knows what part of speech the input word is)
# def lemmatize_sentence(tokens):
#     lemmatizer = WordNetLemmatizer()
#     lemmatized_sentence = []
#     for word, tag in pos_tag(tokens):
#         if tag.startswith('NN'):
#             pos = 'n'
#         elif tag.startswith('VB'):
#             pos = 'v'
#         else:
#             pos = 'a'
#         lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
#     return lemmatized_sentence

# print(lemmatize_sentence(tweet_tokens[200]))

In [10]:
## NOW we need to clean up the lemmatized posts. 

## We do this via a function called remove_noise that begins by removing URLS (identified as a regular expression),
## and twitter handles (identified with a @ symbol). 

## We incorporate the lemmatization from the previous thing into this function. Note that we also have a 
## thing that filters out punctuation, as well as the so-called stop words,
##  which are in the typle of stop_words provided as the second argument)

import re, string

def remove_noise(tweet_tokens, stop_words = ()):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens



In [11]:
## If you haven't got the NLTK module stopwords, you can uncomment the following line of code to get it. 

# nltk.download('stopwords')

In [12]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')


positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []

for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    
covid_cleaned_tokens_list = []

    
## Here is an example of what remove noise does: the first list is the list of tokens in the 500th
## positive tweet, the second is what happens to that list after we delete all the 'junk'


print(positive_tweet_tokens[500])
print(positive_cleaned_tokens_list[500])

['Dang', 'that', 'is', 'some', 'rad', '@AbzuGame', '#fanart', '!', ':D', 'https://t.co/bI8k8tb9ht']
['dang', 'rad', '#fanart', ':d']


In [13]:
## We are going to use Na√Øve Bayes for our classification. The NB classifier in NLTK requires the 
## tweets to be converted to dictionary format, with the words as keys and True for each value.



def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)

positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)
## Now lets label the training tweets as either Positive or Negative
import random
## we import the random module because we will randomly shuffle the dataset, 
## after it's been labeled, into a training chunk and a testing chunk

positive_dataset = [(tweet_dict,"Positive")
                   for tweet_dict in positive_tokens_for_model]
negative_dataset = [(tweet_dict,"Negative")
                   for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

random.shuffle(dataset)

train_data = dataset[:7000]
test_data = dataset[7000:]
from nltk import classify
from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(train_data)

# print("Accuracy is:", classify.accuracy(classifier, test_data))

# print(classifier.show_most_informative_features(20))

from nltk.tokenize import word_tokenize

def string_to_classification_verbose(string):
    tokens = remove_noise(word_tokenize(string))
    print('The classification is: ',classifier.classify(dict([token,True] for token in tokens)))
    probs = classifier.prob_classify(dict([token,True] for token in tokens))
    print('Negative probability ',probs.prob('Negative'))
    print('Positive probability ',probs.prob('Positive'))
    
def string_to_classification_terse(string):
    tokens = remove_noise(word_tokenize(string))
    return classifier.classify(dict([token,True] for token in tokens))

In [14]:

## Uncomment the line below to see how the string_to_classification_verbose function works

string_to_classification_verbose('people aren‚Äôt aware of this but HW assassinated 25 percent of Americans after the convention and that explains this shift')

The classification is:  Negative
Negative probability  0.8662386457283909
Positive probability  0.1337613542716073


In [15]:
# type(string_to_classification_terse('people aren‚Äôt aware of this but HW assassinated 25 percent of Americans after the convention and that explains this shift'))

In [16]:
## Now we can define our dataset 

partisan_tweets['sentiment'] = partisan_tweets.text.apply(lambda x: string_to_classification_terse(x))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  partisan_tweets['sentiment'] = partisan_tweets.text.apply(lambda x: string_to_classification_terse(x))


In [17]:
## Lets see what we have: basically its the original partisan tweets dataset (including L/R labels)
## but augmented with a sentiment column)



print(partisan_tweets.head(10))

                                         user_name          user_location  \
66                                         Matty A       New York, NY USA   
164                                    ‚ùå Romulus ‚ùå                      ‚ùå   
178                                  BellaLunaTick  Occupied Shawnee Land   
257                             John Lichtenberger            Roxbury, NJ   
272                                       Jimbo360           Florida, USA   
310                             John Lichtenberger            Roxbury, NJ   
389                       The Soulsborne Waifu ‚öîÔ∏èüåô                    NaN   
476  TinkerLane üò∑ Mask/Handwashing/Distance ‚öΩüóΩüååüç∑üêãüåä          Virginia, USA   
481                               Samia Ali Salama      Pennsylvania, USA   
549                                      Ben Foerg            Los Angeles   

                                      user_description         user_created  \
66   üá∫üá∏  Happily Married & a Proud

In [18]:
## Lets cut our dataset down to 

df = partisan_tweets.filter(items = ['political_leaning','sentiment'])
print(df.head(10))

pd.crosstab(df.political_leaning,df.sentiment)

## Now we can do some statistics: the negativity ratio for liberal tweets is 544/536 ~ 1.02. 
## The negativity ratio for conservative tweets is 768/731 = 1.05. So the proportions are not
## much different. Possibly if we changed our classifier (there is a more rich array of bayesian
## classifiers available in the Scikit package) or if we expanded our partisan dictionaries, 
## we would find a difference. 

## It is somewhat noteworthy that 

    political_leaning sentiment
66                  R  Positive
164                 R  Negative
178                 R  Negative
257                 R  Negative
272                 R  Negative
310                 R  Positive
389                 R  Negative
476                 R  Negative
481                 L  Negative
549                 R  Positive


sentiment,Negative,Positive
political_leaning,Unnamed: 1_level_1,Unnamed: 2_level_1
L,553,527
R,762,737


In [19]:
## The total number of positive tweets is 536+731 = 1267, whereas there are 1312 negative tweets. 
## The pooled estimate of the proportion of negative tweets is 1312/(1312+1267) = 0.50872. 

## The standard error when we get is sqrt(.25(1/1080) + .25(1/1499)) = 0.01995643588

## The proportion of liberal tweets that are negative is 0.5037037037

## The proportion of conservative tweets that are negative is 0.51234156104

## The difference of the proportions is only 0.43 times the standard error - falls under the 1.96*SE threshold
## for significance! So we observe a difference in the (small) data set we filtered down to, but it is not a 
## statistically significant one. 