In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import re
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
import pandas as pd
df = pd.read_csv("tweets.csv")

[nltk_data] Downloading package stopwords to /home/xih037/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/xih037/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
#get df for each other
hc = df[df["handle"] == "HillaryClinton"][["text","is_retweet","time"]]
dt = df[df["handle"] == "realDonaldTrump"][["text","is_retweet","time"]]
f'Hillary have {hc.shape[0]} tweets, Trump have {dt.shape[0]} tweets. {((hc["is_retweet"].sum()/hc.shape[0]) * 100).round(2)}% of Hillary tweets are retweets, {((dt["is_retweet"].sum()/dt.shape[0]) * 100).round(2)}% of Trump tweets are retweets'


'Hillary have 3226 tweets, Trump have 3218 tweets. 18.51% of Hillary tweets are retweets, 3.88% of Trump tweets are retweets'

In [3]:
#remove retweet
df = df[df["is_retweet"] == False]

In [4]:
#cleaned text
def clean_and_tokenize(tweet):
    # Remove URLs
    tweet = re.sub(r'http\S+', '', tweet)
    # Remove usernames
    tweet = re.sub(r'@\w+', '', tweet)
    # Remove hashtags
    tweet = re.sub(r'#\w+', '', tweet)
    # Remove punctuation
    tweet = re.sub(r'[^\w\s]', '', tweet)
    # Tokenize by splitting on whitespace
    tokens = tweet.split()
    # Convert to lowercase
    tokens = [token.lower() for token in tokens]
    return tokens
hc["text_cleaned"] = hc["text"].apply(clean_and_tokenize)
dt["text_cleaned"] = dt["text"].apply(clean_and_tokenize)

In [5]:
hc_words = [token for word in hc["text_cleaned"] for token in set(word) if token not in stop_words]
hc_counts = Counter(hc_words)
hc_most_common_word = hc_counts.most_common(100000000)
hc_summary = pd.DataFrame(hc_most_common_word, columns=['Word', 'Frequency'])
hc_summary["percent"] = hc_summary["Frequency"]/hc.shape[0]
hc_summary

Unnamed: 0,Word,Frequency,percent
0,trump,703,0.217917
1,hillary,679,0.210477
2,donald,414,0.128332
3,president,263,0.081525
4,trumps,197,0.061066
...,...,...,...
5981,luchan,1,0.000310
5982,renta,1,0.000310
5983,afectadas,1,0.000310
5984,afroamericanas,1,0.000310


In [6]:
dt_words = [token for word in dt["text_cleaned"] for token in set(word) if token not in stop_words]
dt_counts = Counter(dt_words)
dt_most_common_word = dt_counts.most_common(10000000)
dt_summary = pd.DataFrame(dt_most_common_word, columns=['Word', 'Frequency'])
dt_summary["percent"] = dt_summary["Frequency"]/dt.shape[0]
dt_summary

Unnamed: 0,Word,Frequency,percent
0,thank,517,0.160659
1,great,440,0.136731
2,hillary,326,0.101305
3,trump,326,0.101305
4,amp,240,0.074580
...,...,...,...
5217,4000,1,0.000311
5218,lowell,1,0.000311
5219,crucialonly,1,0.000311
5220,dependency,1,0.000311


In [7]:
#exclamation mark
hc_exclamation = hc[hc["text"].str.contains("!", case=False, na=False)].shape[0]
dt_exclamation = dt[dt["text"].str.contains("!", case=False, na=False)].shape[0]
f'{hc_exclamation / hc.shape[0] * 100}% of Hillary tweets have ！mark, {dt_exclamation / dt.shape[0]* 100}% of Trump tweets have ！mark'

'4.153750774953503% of Hillary tweets have ！mark, 62.83405842137974% of Trump tweets have ！mark'

In [8]:
#quote of their own
hc_quote = hc[hc["text"].str.contains("—hillary", case=False, na=False)].shape[0]
dt_quote = dt[dt["text"].str.contains("- trump", case=False, na=False)].shape[0]
f'{hc_quote / hc.shape[0] * 100}% of Hillary tweets is quoting of her own words, {dt_quote / dt.shape[0]* 100}% of Trump tweets is quoting of his words'


'9.702417854928704% of Hillary tweets is quoting of her own words, 0.06215040397762585% of Trump tweets is quoting of his words'

In [9]:
#quote of each other
hc_qo = hc[hc["text"].str.contains("—Trump", case=False, na=False)].shape[0]
dt_qo = dt[dt["text"].str.contains("-hiliary", case=False, na=False)].shape[0]
f'{hc_qo / hc.shape[0] * 100}% of Hillary tweets is quoting of Trump, {dt_qo / dt.shape[0]* 100}% of Trump tweets is quoting of Hillary'

'0.24798512089274644% of Hillary tweets is quoting of Trump, 0.0% of Trump tweets is quoting of Hillary'

In [10]:
##campaign slogan
hc_slogan = hc[hc["text"].str.contains("Stronger Together", case=False, na=False)].shape[0]
dt_slogan = dt[dt["text"].str.contains("maga",case=False, na=False)].shape[0]
f'{hc_slogan/ hc.shape[0] * 100}% of Hillary tweets mentioned Stronger Together, {dt_slogan / dt.shape[0]* 100}% of Trump tweets meantioned MEGA'


'0.8679479231246126% of Hillary tweets mentioned Stronger Together, 2.050963331261653% of Trump tweets meantioned MEGA'

In [11]:
#trump immigration promise
immig_words = ["border", "deport", "mexico", "wall", "refugee","illegal"]
dt_immig = dt_summary[dt_summary['Word'].apply(lambda tweet: any(word in tweet.lower() for word in immig_words))]
dt_immig_words = [i for i in dt_immig["Word"]]
dt_immig_p = dt_immig.sum()
hc_immig = hc_summary[hc_summary['Word'].apply(lambda tweet: any(word in tweet.lower() for word in immig_words))]
hc_immig_words = [i for i in hc_immig["Word"]]
hc_immig_p = hc_immig.sum()
f'About immigration, Trump often use {dt_immig_words} and it consists of {dt_immig_p.loc["percent"]*100} of his tweets,Hillary often use {hc_immig_words} and it consists of {hc_immig_p.loc["percent"]*100} of her tweets, one of it is about Lets imagine a tomorrow in which no child grows up under the shadows of discrimination or deportation'


"About immigration, Trump often use ['wall', 'illegal', 'mexico', 'border', 'borders', 'refugees', 'illegals', 'illegally', 'wallace', 'refugee', 'interviewall', 'borderless', 'prowall', 'deportation', 'wallsis', 'walls', 'wallet'] and it consists of 4.630205096333126 of his tweets,Hillary often use ['wall', 'walls', 'deport', 'mexico', 'deportation', 'deported', 'illegal', 'deportar', 'stonewall', 'deportación', 'refugees', 'wallet', 'wallinstead', 'walldid', 'deporting', 'refugee', 'border', 'deportations'] and it consists of 2.5728456292622446 of her tweets, one of it is about Lets imagine a tomorrow in which no child grows up under the shadows of discrimination or deportation"

In [12]:
#trump tax cut
tax_words = ["tax"]
dt_tax = dt_summary[dt_summary['Word'].apply(lambda tweet: any(word in tweet.lower() for word in tax_words))]
dt_tax_words = [i for i in dt_tax["Word"]]
dt_tax_p = dt_tax.sum()
hc_tax = hc_summary[hc_summary['Word'].apply(lambda tweet: any(word in tweet.lower() for word in tax_words))]
hc_tax_words = [i for i in hc_tax["Word"]]
hc_tax_p = hc_tax.sum()
f'About immigration, Trump often use {dt_tax_words} and it consists of {dt_immig_p.loc["percent"]*100} of his tweets,Hillary often use {hc_tax_words} and it consists of {hc_tax_p.loc["percent"]*100} of her tweets, one of it is about Last night, Donald Trump said not paying taxes was "smart." You know what I call it? Unpatriotic.'



'About immigration, Trump often use [\'tax\', \'taxes\', \'taxpayers\', \'overtaxed\', \'overtaxes\'] and it consists of 4.630205096333126 of his tweets,Hillary often use [\'tax\', \'taxes\', \'taxpayers\', \'taxpayer\', \'taxesand\'] and it consists of 2.851828890266584 of her tweets, one of it is about Last night, Donald Trump said not paying taxes was "smart." You know what I call it? Unpatriotic.'

In [13]:
# hillary health promise(need to be modified)
health_words = ["heath","affordable","care"]
dt_health = dt_summary[dt_summary['Word'].apply(lambda tweet: any(word in tweet.lower() for word in health_words))]
dt_health_words = [i for i in dt_health["Word"]]
dt_health_p = dt_health.sum()
hc_health = hc_summary[hc_summary['Word'].apply(lambda tweet: any(word in tweet.lower() for word in health_words))]
hc_health_words = [i for i in hc_health["Word"]]
hc_health_p = hc_health.sum()
f'About health plan, Trump often use {dt_health_words} and it consists of {dt_immig_p.loc["percent"]*100}% of his tweets,Hillary often use {hc_health_words} and it consists of {hc_health_p.loc["percent"]*100} of her tweets.'



"About health plan, Trump often use ['care', 'obamacare', 'healthcare', 'careful', 'cares', 'career', 'scared', 'childcare', 'ocare', 'careless', 'scare', 'careers', 'pledgecareful'] and it consists of 4.630205096333126% of his tweets,Hillary often use ['care', 'affordable', 'career', 'cares', 'scared', 'careerfocused', 'scares', 'careerand', 'childcare', 'medicare', 'heath', 'scare'] and it consists of 2.9138251704897704 of her tweets."