# Twarc Data Extraction

In [None]:
#twarc is a command line tool and Python library for collecting and archiving Twitter JSON data via the Twitter API
#!pip install twarc-csv
import json
import pandas as pd
from datetime import datetime, timezone, timedelta

from twarc.client2 import Twarc2
from twarc_csv import CSVConverter

# Bearer token from the Twitter Developer portal goes here ("XXX" is a placeholder).
# NOTE(review): avoid committing a real token; consider reading it from the environment.
t = Twarc2(bearer_token="XXX")


In [None]:
# Collect tweets in 5-minute windows, stepping from 10000 minutes ago towards
# 1500 minutes ago in 40-minute strides. Every window is written to its own
# .jsonl file and then converted to a .csv file with the same index.

# Upper bound on the number of result pages stored per time window.
MAX_PAGES_PER_WINDOW = 6

# Running index used to name the per-window output files. It was previously
# read without ever being initialised, which raised a NameError.
y = 0

for x in range(10000, 1500, -40):

    # Start and end times must be in UTC. A single "now" keeps the window
    # exactly 5 minutes long.
    now = datetime.now(timezone.utc)
    start_time = now - timedelta(minutes=x)
    # end_time must be at least 30 seconds ago.
    end_time = now - timedelta(minutes=x - 5)

    # "a", "the", "i" and "you" are the searched words; quotes, retweets and
    # replies are excluded and the language is restricted to English.
    query = " (a OR the OR i OR you) lang:en -is:quote -is:retweet -is:reply"

    print(f"Searching for \"{query}\" tweets from {start_time} to {end_time}...")

    # search_results is a generator, max_results is max tweets per page, not total,
    # 100 is max when using all expansions.
    search_results = t.search_recent(query=query, start_time=start_time, end_time=end_time, max_results=100)

    jsonl_path = "XXX" + str(y) + ".jsonl"
    csv_path = "XXX" + str(y) + ".csv"

    # Open the file once per window and append one JSON line per page. The old
    # code reopened it in "w+" mode six times for the same page and stopped
    # after the first page.
    with open(jsonl_path, "w", encoding="utf-8") as f:
        for page_number, page in enumerate(search_results, start=1):
            f.write(json.dumps(page) + "\n")
            print("Wrote a page of results...")
            if page_number >= MAX_PAGES_PER_WINDOW:
                break

    print("Converting to CSV...")

    # Converting the JSON lines into a CSV (UTF-8, matching how the CSVs are
    # read back later in the notebook).
    with open(jsonl_path, "r", encoding="utf-8") as infile, \
            open(csv_path, "w", encoding="utf-8") as outfile:
        converter = CSVConverter(infile, outfile)
        converter.process()

    y = y + 1

    print("Finished.")

# CSV Merging and Creating the DataFrame

In [None]:
import pandas as pd
import glob
import numpy as np

In [None]:
# Folder that holds the per-window CSV exports.
path = r'XXX' # use your path
all_files = glob.glob(path + "/*.csv")

# Read every export into its own frame, then stack them into a single table
# with a fresh running index.
li = [
    pd.read_csv(csv_file, index_col=None, header=0, encoding='utf-8')
    for csv_file in all_files
]

df = pd.concat(li, axis=0, ignore_index=True)

In [None]:
# Columns retained for the analysis; names absent from the frame are simply
# not selected (isin builds a boolean mask over the existing columns).
columns_to_keep = [
    "id", "conversation_id", "author_id",
    "created_at", "text", "lang", "source", "public_metrics.like_count",
    "public_metrics.quote_count", "public_metrics.reply_count", "public_metrics.retweet_count",
    "reply_settings", "possibly_sensitive", "entities.annotations", "entities.cashtags", "entities.hashtags",
    "entities.mentions", "entities.urls", "context_annotations", "attachments.media", "attachments.poll.id",
    "author.id", "author.created_at", "author.public_metrics.followers_count",
    "author.public_metrics.listed_count", "author.public_metrics.tweet_count", "author.verified",
    "geo.country", "geo.country_code", "__twarc.retrieved_at", "__twarc.url", "__twarc.version",
]
keep_mask = df.columns.isin(columns_to_keep)
df = df.loc[:, keep_mask]
                                  
                           

In [None]:
# Rows whose id has already appeared earlier in the frame.
is_repeated_id = df.duplicated(subset=['id'])
duplicateRowsDF = df.loc[is_repeated_id]
print("Duplicate Rows based on a single column are:", duplicateRowsDF, sep='\n')

In [None]:
# Show the rows that carry a country code.
df[df['geo.country_code'].notna()]

In [None]:
# Elapsed time between tweet creation and retrieval by twarc, stored both as
# a timedelta and as a number of hours.
retrieved_at = pd.to_datetime(df["__twarc.retrieved_at"])
created_at = pd.to_datetime(df["created_at"])

df["time_diff"] = retrieved_at - created_at

df["time_diff_hours"] = (retrieved_at - created_at).dt.total_seconds() / 3600



In [None]:
# Keep only tweets that were at least 24 hours old when twarc retrieved them.
at_least_a_day_old = df['time_diff_hours'] >= 24
df = df[at_least_a_day_old]

In [None]:
# Plot frequency of users, sources, etc.
def plot_frequency_charts(df, feature, title):
    """Draw a count plot of the 20 most frequent values of one column.

    Every bar is annotated with its share (in percent) of all rows in ``df``.

    Args:
        df: DataFrame holding the data.
        feature: Name of the column whose values are counted.
        title: Label used for the x axis.
    """
    f, ax = plt.subplots(1, 1, figsize=(16, 4))
    total = float(len(df))
    top_values = df[feature].value_counts().index[:20]

    # The column is passed as ``x=`` and the axes explicitly: current seaborn
    # releases treat the first positional argument as ``data``.
    sns.countplot(x=df[feature], order=top_values, palette='Set3', ax=ax)

    for p in ax.patches:
        height = p.get_height()
        # Percentage label centred slightly above the bar.
        ax.text(p.get_x() + p.get_width() / 2.,
                height + 3,
                '{:1.2f}%'.format(100 * height / total),
                ha="center")

    # Only this title was ever visible; the earlier set_title call was
    # overwritten by it.
    plt.title('Frequency of {} '.format(feature))
    plt.ylabel('Frequency', fontsize=12)
    plt.xlabel(title, fontsize=12)
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Frequency chart of the 'author.verified' column.
plot_frequency_charts(df, 'author.verified','author.verified')

In [None]:
# Average of each engagement metric, split by whether the author is verified.
engagement_columns = [
    'public_metrics.like_count',
    'public_metrics.quote_count',
    'public_metrics.reply_count',
    'public_metrics.retweet_count',
]
df.groupby("author.verified", as_index=True)[engagement_columns].mean()

In [None]:
# The account should not represent, or be associated with, a prominently
# recognized individual or brand: drop rows whose author.verified equals 1.
not_verified = df['author.verified'] != 1
df = df[not_verified]

In [None]:
# Average reply count for each reply_settings value.
df.groupby("reply_settings", as_index=True)[['public_metrics.reply_count']].mean()

In [None]:
# Keep only the tweets whose reply_settings value is 'everyone'.
df = df[df['reply_settings'] == 'everyone'] 

In [None]:
# Disregard authors that have more than 10k followers.
at_most_10k_followers = df['author.public_metrics.followers_count'] <= 10000
df = df[at_most_10k_followers]

In [None]:
# Renumber the rows 0..n-1 after the filters above. The result has to be
# assigned back: reset_index() returns a new frame and leaves df untouched.
df = df.reset_index(drop=True)

In [None]:
# Number of missing values in every column.
df.isna().sum()

In [None]:
# Box plot of the authors' follower counts, restricted to the range
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
followers = df['author.public_metrics.followers_count']
Q1 = followers.quantile(0.25)
Q3 = followers.quantile(0.75)
IQR = Q3 - Q1    #IQR is interquartile range.

# A descriptive mask name is used so the builtin ``filter`` is not shadowed.
followers_in_range = (followers >= Q1 - 1.5 * IQR) & (followers <= Q3 + 1.5 * IQR)
sns.boxplot(data=df, x=followers.loc[followers_in_range], sym="red")
plt.show()

In [None]:
# Total engagement per tweet = likes + retweets + replies.
column_names = [
    'public_metrics.like_count',
    'public_metrics.retweet_count',
    'public_metrics.reply_count',
]
df['Engagement_Total'] = df.loc[:, column_names].sum(axis='columns')

In [None]:
# Box plot of total engagement, restricted to the range
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
engagement = df['Engagement_Total']
Q1 = engagement.quantile(0.25)
Q3 = engagement.quantile(0.75)
IQR = Q3 - Q1    #IQR is interquartile range.

# A descriptive mask name is used so the builtin ``filter`` is not shadowed.
engagement_in_range = (engagement >= Q1 - 1.5 * IQR) & (engagement <= Q3 + 1.5 * IQR)
sns.boxplot(data=df, x=engagement.loc[engagement_in_range], sym="red")

In [None]:
# Density plot of the Flesch scores; scores below -200 are left out.
# NOTE(review): requires the 'flesch_reading' column, which is computed in the
# "Complexity" section further down - run that cell first.
flesch_in_range = (df['flesch_reading'] >= -200)
sns.displot(data=df, x=df['flesch_reading'].loc[flesch_in_range], kind="kde", bw_adjust=.25)


In [None]:
# Examine why some texts are scored as very complex: lowest scores first.
df.sort_values('flesch_reading')

# Preprocessing

In [None]:
# Make sure every text is a str before cleaning. The converted column has to
# be assigned back, otherwise the conversion is thrown away.
df['text'] = df['text'].apply(str)

In [None]:
#installing ekphrasis
#!pip install ekphrasis

In [None]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons


In [None]:
import re
import string

In [None]:
# ekphrasis pre-processor used by tokenize() below.
text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'date', 'number'],
    # terms that would be annotated (annotation is currently switched off)
    #annotate={"hashtag", "elongated", "repeated",
    #    'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words such as "ahhhh", "hmmmm"
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own.
    # The tokenizer should take a string as input and return a list of tokens.
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries for replacing tokens extracted from the text
    # with other expressions. More than one dictionary can be passed.
    dicts=[emoticons]
)

In [None]:
def clean_text(text):
    """Lower-case a text and expand common English contractions.

    Typographic quotes are first replaced by a plain apostrophe, then the
    contraction rules are applied in order, literal backslash-n sequences
    are turned into spaces and runs of whitespace are collapsed.

    Args:
        text: The raw text (must be a ``str``).

    Returns:
        The cleaned text.
    """
    text = text.lower()

    # Normalise typographic quotes so the contraction patterns below match.
    for curly_quote in ("’", "“", "”"):
        text = text.replace(curly_quote, "'")

    # Applied top to bottom; the order matters ("won't"/"can't" must be
    # handled before the generic "n't" rule).
    contraction_rules = (
        (r"i'm", "i am"),
        (r"he's", "he is "),
        (r"she's", "she is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"it's", " it is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", "  are"),
        (r"\'d", "  would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"n't", " not"),
        (r"'bout", "about"),
    )
    for pattern, replacement in contraction_rules:
        text = re.sub(pattern, replacement, text)

    # handling literal "\n" sequences (a backslash followed by the letter n)
    text = text.replace("\\n", " ")

    # remove extra whitespace; the raw string avoids the invalid "\s" escape
    # of a normal string literal
    text = re.sub(r'\s+', ' ', text)

    return text
# Store the cleaned version of every tweet text in a new column.
df['text_preprocessed'] = df['text'].apply(clean_text)

In [None]:
def tokenize(text):
    """Run the ekphrasis pre-processor on a text and blank out punctuation.

    The tokens returned by ``text_processor.pre_process_doc`` are joined with
    single spaces; every character of ``string.punctuation`` is then replaced
    by a space.
    """
    tokens = text_processor.pre_process_doc(text)
    punctuation_class = r'[%s]' % re.escape(string.punctuation)
    return re.sub(punctuation_class, r' ', " ".join(tokens))

# Replace the cleaned text by its tokenized, punctuation-free form.
df['text_preprocessed'] = df['text_preprocessed'].apply(tokenize)

In [None]:
# Compare raw and preprocessed text for the first 50 rows.
df[['text','text_preprocessed']].head(50)

In [None]:
# Mean number of characters per whitespace-separated word of each tweet.
df['avg_length_words'] = df["text_preprocessed"].apply(
    lambda x: np.mean(list(map(len, x.split())))
)

In [None]:
#emoji and word count
import emoji
import regex

def split_count(text):
    """Count the emoji contained in a text.

    The text is split into extended grapheme clusters; a cluster counts as
    one emoji when at least one of its characters is a known emoji.

    Args:
        text: The text to inspect.

    Returns:
        The number of emoji clusters found.
    """
    # emoji 2.x removed UNICODE_EMOJI in favour of EMOJI_DATA; support both.
    emoji_table = getattr(emoji, 'EMOJI_DATA', None)
    if emoji_table is None:
        emoji_table = emoji.UNICODE_EMOJI['en']

    graphemes = regex.findall(r'\X', text)
    # The former word counter (and the emoji stripping that fed it) was never
    # returned, so only the emoji count is computed.
    return sum(
        1 for grapheme in graphemes
        if any(char in emoji_table for char in grapheme)
    )
# Emoji are counted on the raw text, not on the preprocessed one.
df['emoji_count'] = df['text'].apply(split_count)

In [None]:
#remove stopwords
from nltk.corpus import stopwords
# NLTK's English stop-word list as a set.
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return ``text`` without the whitespace-separated words found in ``stop_words``."""
    kept_words = (token for token in str(text).split() if token not in stop_words)
    return " ".join(kept_words)

In [None]:
# Stop-word-free variant of the preprocessed text.
df["text_preprocessed_nostopwords"] = df["text_preprocessed"].apply(remove_stopwords)


In [None]:
# creating a column for created hour 
import dateutil.parser as p

def to_datetime(datestring):
    """Parse a date string and return its time of day formatted as "HH:MM"."""
    return p.parse(datestring).strftime("%H:%M")


In [None]:
# "HH:MM" time of day at which each tweet was created.
df['created_at_hour'] = df['created_at'].apply(to_datetime)

In [None]:
#check if there is hashtags, mentions, urls, media, poll
def check_entities(x):
    """Return 0 when ``x`` is NaN and 1 for any other value.

    NaN is the only value that compares unequal to itself, which is what the
    ``x != x`` test relies on.
    """
    return 0 if x != x else 1

In [None]:
# Flag column -> source column. Each flag is 1 when the source cell is filled
# and 0 when it is NaN.
entity_flags = {
    'check_hashtags': 'entities.hashtags',
    'check_mentions': 'entities.mentions',
    'check_urls': 'entities.urls',
    'check_media': 'attachments.media',
    'check_poll': 'attachments.poll.id',
}
for flag_column, source_column in entity_flags.items():
    df[flag_column] = df[source_column].apply(check_entities)

# EDA

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Features that go into the correlation analysis.
# NOTE(review): "mention_count", "url_count" and "flesch_reading" are created
# in cells further down; those cells have to be run first.
correlation_features = [
    "public_metrics.like_count", "public_metrics.quote_count",
    "public_metrics.reply_count", "public_metrics.retweet_count",
    "possibly_sensitive", "author.public_metrics.followers_count",
    "author.public_metrics.listed_count",
    "author.public_metrics.tweet_count",
    "emoji_count", "check_hashtags", "mention_count", "url_count", "check_media",
    "check_poll", "flesch_reading", "avg_length_words",
]
df5 = df[correlation_features]

In [None]:
# Correlation matrix of the selected features; kept in ``corr`` because a
# later cell plots it again.
corr = df5.corr()

# A single figure is enough: the former extra 16x6 figure was created and
# left empty, and the matrix was computed twice.
plt.figure(figsize=(12, 10))

sns.heatmap(corr,
            cmap='viridis', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);


In [None]:
# Imports PIL module 
from PIL import Image
from wordcloud import WordCloud,ImageColorGenerator,STOPWORDS



def green_color_func(word, font_size, position,orientation,random_state=None, **kwargs):
    """Colour callback that paints every word in the same green (#00ff00)."""
    return '#00ff00'

# Words left out of the cloud: the placeholder tokens produced by the text
# normalisation plus the default word-cloud stop words. A dedicated name is
# used so the NLTK ``stop_words`` set read by remove_stopwords() is not
# overwritten by this list.
wordcloud_stopwords = ['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'date', 'number']  + list(STOPWORDS)

# Image whose shape is used as the mask of the cloud.
logo = np.array(Image.open('XXX'))

dis_wc = WordCloud(stopwords=wordcloud_stopwords,
            collocations=False,
            background_color="Black",mask=logo).generate(' '.join(df['text_preprocessed_nostopwords']))
# NOTE(review): image_colors is not used below - the cloud is recoloured with
# a constant green instead of the colours of the logo.
image_colors = ImageColorGenerator(logo)
dis_wc.recolor(color_func=green_color_func, random_state=3)

fig, ax1 = plt.subplots(figsize=(20,6))

ax1.imshow(dis_wc)
ax1.set_title("Word cloud for tweets", fontsize=20)
ax1.axis("off")

# plt.show() as in the other plotting cells; fig.show() warns on non-GUI
# (inline) backends.
plt.show()

In [None]:
#checking the most common words
from collections import Counter
# Frequency of every whitespace-separated word of the raw tweets.
cnt = Counter(
    token
    for tweet in df["text"].values
    for token in tweet.split()
)

cnt.most_common(10)

In [None]:
#checking the most common words after removing stopwords
from collections import Counter
# Frequency of every word once stop words have been removed.
cnt = Counter(
    token
    for tweet in df["text_preprocessed_nostopwords"].values
    for token in tweet.split()
)

cnt.most_common(10)

In [None]:
# Engagement summary for authors with at least one follower and more than
# one tweet.
has_followers = df['author.public_metrics.followers_count'] > 0
has_several_tweets = df['author.public_metrics.tweet_count'] > 1
summary_columns = ['public_metrics.like_count', 'public_metrics.reply_count',
                   'public_metrics.retweet_count', 'Engagement_Total']
df[has_followers & has_several_tweets][summary_columns].describe()

In [None]:
# Keep only authors with at least one follower and more than one tweet.
has_followers = df['author.public_metrics.followers_count'] > 0
has_several_tweets = df['author.public_metrics.tweet_count'] > 1
df = df[has_followers & has_several_tweets]

In [None]:
# Summary statistics of the engagement columns of the current frame.
df[['public_metrics.like_count', 'public_metrics.reply_count', 'public_metrics.retweet_count', 'Engagement_Total']].describe()

In [None]:
# The four engagement counters on their own.
combined_df = df[['public_metrics.like_count','public_metrics.quote_count', 'public_metrics.reply_count', 'public_metrics.retweet_count']]

In [None]:
# Split the columns of combined_df into numeric and non-numeric ones.
total_columns = combined_df.columns
# Public select_dtypes() instead of the private _get_numeric_data(); boolean
# columns are kept on the numeric side.
num_col = combined_df.select_dtypes(include=["number", "bool"]).columns
cat_col = list(set(total_columns) - set(num_col))

In [None]:
# One point plot per engagement metric showing its summary statistics
# (every row of describe() except "count").
describe_num_df = combined_df.describe(include=["int64","float64"])
describe_num_df.reset_index(inplace=True)

describe_num_df = describe_num_df[describe_num_df["index"] != "count"]
for i in num_col:
    if i == "index":
        continue
    # factorplot() is no longer available in seaborn; catplot(kind="point")
    # is its replacement and reproduces factorplot's default plot kind.
    sns.catplot(x="index", y=i, data=describe_num_df, kind="point")
    plt.show()


# Show the frame without the rows labelled 2 and 4. A dedicated name is used
# because ``t`` already holds the Twarc2 client.
excluded_index_labels = [2, 4]
df.loc[~df.index.isin(excluded_index_labels)]

In [None]:
def to_datetime(datestring):
    """Parse a date string and return its time of day formatted as "HH:MM"."""
    return p.parse(datestring).strftime("%H:%M")
def to_date(datestring):
    """Parse a date string and return its calendar date formatted as "YYYY-MM-DD"."""
    return p.parse(datestring).strftime('%Y-%m-%d')

from datetime import datetime
#df_new = df
# Split the creation timestamp into a date string ("YYYY-MM-DD") and a
# time-of-day string ("HH:MM").
#df_new['date'] = df_new['created_at']
df['date_1'] = df['created_at'].apply(to_date)
df['hour'] = df['created_at'].apply(to_datetime)

In [None]:
# Add weekday columns (0 = Monday) and the matching day names.
# date_1 holds 'YYYY-MM-DD' strings, so the format must not contain a time
# part; the dates are parsed once and reused.
parsed_dates = pd.to_datetime(df['date_1'], format='%Y-%m-%d')
df['day'] = parsed_dates.dt.weekday
df['day_name'] = parsed_dates.dt.day_name()

In [None]:
# df_new was never created (its assignment is commented out above); work on
# a copy so the conversions below do not alter df.
df_new = df.copy()

# date_1 holds 'YYYY-MM-DD' strings; the .dt accessor needs real datetimes.
df_new['date_1'] = pd.to_datetime(df_new['date_1'], format='%Y-%m-%d')

#add weekday columns
df_new['day'] = df_new['date_1'].dt.weekday
df_new['day_name'] = df_new['date_1'].dt.day_name()

#pick only relevant columns
week_df = df_new[['public_metrics.like_count','public_metrics.quote_count', 'public_metrics.reply_count', 'public_metrics.retweet_count',
                  'emoji_count', 'day','day_name', 'avg_length_words']]

# create a group by object on the reduced frame, so that no text column
# takes part in the averaging
weekday_grouped = week_df.groupby('day_name')

#create a df with means sorted by day
week_mean = weekday_grouped.mean(numeric_only=True).sort_values(by="day")

In [None]:
def plot_means_by_weekday(dataframe, variable_1, variable_2=None, variable_3=None, variable_4=None):
    """Plot up to four columns of ``dataframe`` as lines over its index.

    Args:
        dataframe: Frame whose index supplies the x values.
        variable_1: Name of the first column to plot (always drawn).
        variable_2: Optional second column name.
        variable_3: Optional third column name.
        variable_4: Optional fourth column name.
    """
    #create a figure
    fig, ax = plt.subplots(figsize=(10,8))

    # the index is shared by every line
    x = dataframe.index
    ax.plot(x, dataframe[variable_1], label=variable_1)

    #add other plots if variables given
    for variable in (variable_2, variable_3, variable_4):
        if variable is not None:
            ax.plot(x, dataframe[variable], label=variable)

    #format title and labels
    plt.xticks(rotation=45)
    plt.title("Average engagement by the day of the week", fontsize=14)
    plt.legend()

In [None]:
# Average retweets, likes, quotes and replies for every row of week_mean.
plot_means_by_weekday(week_mean, 'public_metrics.retweet_count',
                      'public_metrics.like_count',
                      'public_metrics.quote_count','public_metrics.reply_count')




In [None]:
# create a group by object; df already carries the 'day' and 'day_name'
# columns (df_new is not defined at this point)
weekday_grouped_1 = df.groupby(['day_name','day'])

# number of tweets and mean like count per weekday; only the like count is
# aggregated, because 'mean' cannot be applied to the text columns
weekday_count = weekday_grouped_1[['public_metrics.like_count']].agg(['count','mean'])

#pick the like count column and turn it into a separate df
week_impressions = weekday_count['public_metrics.like_count']
#set index to be the day
week_impressions = week_impressions.reset_index()
week_impressions = week_impressions.set_index('day_name')
#sort day names
week_impressions = week_impressions.sort_values(by='day')

In [None]:
# Display the per-weekday tweet count and mean like count.
week_impressions

In [None]:
#plot correlations
# NOTE(review): corr is the matrix computed from df5 in an earlier cell, which
# holds more than the engagement metrics named in the title - confirm the title.
fig, ax = plt.subplots(figsize=(15,15)) 
_ = sns.heatmap(corr, annot = True, ax=ax)
plt.title("Correlation matrix of engagement metrics", fontsize=16)
plt.show()

In [None]:
# Range [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for total engagement.
engagement = df['Engagement_Total']
Q1 = engagement.quantile(0.25)
Q3 = engagement.quantile(0.75)
IQR = Q3 - Q1    #IQR is interquartile range.

# The mask keeps the name ``filter`` because the next cell reads it; both
# bounds are inclusive.
filter = engagement.between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)

In [None]:
# Box plot of total engagement for the rows selected by the ``filter`` mask
# built in the previous cell.
sns.boxplot(data=df, x=df['Engagement_Total'].loc[filter], sym="red")
plt.show()

In [None]:
# Distinct reply_settings values left in the frame, in sorted order.
print(sorted(df['reply_settings'].unique()))


In [None]:
# NOTE: df_eda is another name for the same frame, not a copy.
df_eda = df

# Histogram of the number of characters per tweet.
# Texts range from 10 to 350 characters; most are between 40 and 150 characters.
df_eda['text'].str.len().hist()

In [None]:
#counting url and mentions
def _count_placeholder_tokens(text, placeholders):
    """Return how many whitespace-separated tokens of ``text`` equal one of ``placeholders``."""
    tokens = text.split()
    return sum(tokens.count(placeholder) for placeholder in placeholders)


# Whole tokens are counted rather than substrings, so that words which merely
# contain a placeholder (e.g. "users", "curl") are not counted.
words = ["url"]
df["url_count"] = df.text_preprocessed.apply(lambda x: _count_placeholder_tokens(x, words))

words2 = ["user"]
df["mention_count"] = df.text_preprocessed.apply(lambda x: _count_placeholder_tokens(x, words2))


# Emotion and Sentiment

In [None]:
import transformers
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

In [None]:
# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

task='emotion'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# download label mapping (tab-separated "index<TAB>label" lines)
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    mapping_lines = f.read().decode('utf-8').split("\n")
csvreader = csv.reader(mapping_lines, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# Both model and tokenizer are saved into the local MODEL directory: once that
# directory exists from_pretrained(MODEL) loads from it, and a directory
# without tokenizer files makes the tokenizer load fail on the next run.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)


In [None]:
def translationPipeline(text):
    """Classify ``text`` with the currently loaded model.

    Returns:
        A string of the form " <label> <probability>" (note the leading
        space) for the most probable class; the probability is rounded to
        four decimals.
    """
    model_inputs = tokenizer(text, return_tensors='pt')
    logits = model(**model_inputs)[0][0].detach().numpy()
    probabilities = softmax(logits)
    # classes ordered from most to least probable; take the first one
    best = np.argsort(probabilities)[::-1][0]
    return f" {labels[best]} {np.round(float(probabilities[best]), 4)}"


In [None]:
# Run translationPipeline (as defined when this cell is executed) on every
# preprocessed tweet and store its " <label> <score>" string.
df['text_preprocessed_emotion_scores'] = df['text_preprocessed'].apply(translationPipeline)

In [None]:
task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# download label mapping (tab-separated "index<TAB>label" lines)
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    mapping_lines = f.read().decode('utf-8').split("\n")
csvreader = csv.reader(mapping_lines, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# Both model and tokenizer are saved into the local MODEL directory: once that
# directory exists from_pretrained(MODEL) loads from it, and a directory
# without tokenizer files makes the tokenizer load fail on the next run.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

In [None]:
def translationPipeline(text):
    """Classify ``text`` with the currently loaded model.

    Returns:
        The label of the most probable class, prefixed by a single space.
    """
    model_inputs = tokenizer(text, return_tensors='pt')
    logits = model(**model_inputs)[0][0].detach().numpy()
    probabilities = softmax(logits)
    # classes ordered from most to least probable; take the first one
    best = np.argsort(probabilities)[::-1][0]
    return f" {labels[best]}"

In [None]:
# Run translationPipeline (as defined when this cell is executed) on every
# preprocessed tweet and store the returned label.
df['text_preprocessed_sentiment'] = df['text_preprocessed'].apply(translationPipeline)

In [None]:
# Save the enriched frame ('XXX' is a placeholder for the output path).
df.to_csv('XXX')

In [None]:
# Frequency chart of the 'text_preprocessed_sentiment' column.
plot_frequency_charts(df, 'text_preprocessed_sentiment','text_preprocessed_sentiment')

# Complexity

In [None]:
#!pip install textstat

In [None]:
import textstat

In [None]:
#Flesch Reading Ease Score. While the maximum score is 121.22, there is no limit on how low the score can be. A negative score is valid.

In [None]:
def readingscore(text):
    """Return the Flesch Reading Ease score that textstat computes for ``text``."""
    return textstat.flesch_reading_ease(text)

# The score is computed on the stop-word-free text.
df['flesch_reading'] = df['text_preprocessed_nostopwords'].apply(readingscore)


In [None]:
# Emotion heading: the predicted label, or 'Unknown' when the model's
# probability for it is below 0.5.
# The " <label> <score>" strings are split once for the whole column instead
# of once per row, and the result is assigned in a single step. This avoids
# chained assignment and lookups by position on an index that is no longer
# 0..n-1 after the row filters above.
score_parts = df['text_preprocessed_emotion_scores'].astype(str).str.split()
emotion_label = score_parts.str[0]
emotion_probability = score_parts.str[1].astype(float)

df["text_preprocessed_emotion_heading"] = emotion_label.mask(emotion_probability < 0.5, 'Unknown')
        print(i)
        print(df['text_preprocessed_emotion_heading'][i])