In [1]:
import os
import pandas as pd
import math
import nltk
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline  
import re
from nltk.tokenize import WordPunctTokenizer
import pickle

In [2]:
def get_master_political_words():
    """Return the master vocabulary of political terms.

    Returns:
        list[str]: a freshly built list of lowercase tokens (words, hashtags
        without the '#', and two flag-emoji entries), in a fixed order.
    """
    return [
        'maga', 'trump', 'conservative', 'christian', '🇺🇸', 'country',
        'american', 'patriot', 'nra', '2a', 'america', 'resist',
        'president', 'politics', 'theresistance', 'veteran', 'kag', 'news',
        'liberal', 'political', 'usa', 'vet', 'jesus', 'military',
        'freedom', 'resistance', 'constitution', 'trump2020', 'army', 'potus',
        'qanon', 'trumptrain', 'draintheswamp', 'independent', 'democrat', 'buildthewall',
        'americafirst', 'republican', 'government', 'justice', 'history', 'prolife',
        'law', 'navy', 'donald', 'progressive', 'liberty', 'israel',
        'usaf', 'marine', 'constitutional', 'vets', 'libertarian', 'liberals',
        'amendment', 'gop', 'bluewave', 'usmc', 'veterans', 'nation',
        'backtheblue', 'atheist', 'patriots', 'q', 'vietnam', 'police',
        'states', 'democracy', 'realdonaldtrump', 'bluewave2018', 'impeachtrump', 'bluelivesmatter',
        'pro-life', 'constitutionalist', 'feminist', 'u.s.', 'notmypresident', 'genflynn',
        'kag2020', 'politically', 'obama', 'flag', 'maga🇺🇸', 'brexit',
        'americans', 'national', 'patriotic', 'voted', 'voter',
    ]

# Shorter, hand-picked list of political terms; every entry also appears in
# the list returned by get_master_political_words().
common_political_words = [
    'potus', 'qanon', 'trumptrain', 'draintheswamp', 'democrat',
    'buildthewall', 'americafirst', 'republican', 'prolife',
    'maga', 'trump', 'conservative', 'patriot', 'nra', '2a',
    'resist', 'politics', 'theresistance', 'liberal', 'resistance',
    'constitution', 'trump2020', 'kag',
]


In [3]:
def load_master_id_data(file_name, col_name=None):
    """Read a CSV from the 'Data/Master-Data' folder under the current working directory.

    Args:
        file_name: name of the CSV file inside '<cwd>/Data/Master-Data/'.
        col_name: optional single column to load; when given, only that
            column is read.

    Returns:
        pandas.DataFrame with either the whole file or just ``col_name``.
    """
    csv_path = os.getcwd() + '/Data/Master-Data/' + file_name

    if col_name is None:
        # Whole file; the first row is used as the header.
        return pd.read_csv(csv_path, header=0)

    return pd.read_csv(csv_path, usecols=[col_name])

# Lazily-filled cache so the NLTK stop-word corpus is read once, not once per description.
_ENGLISH_STOP_WORDS = None

# Matches @mentions and http(s) URLs (same two patterns as before, pre-compiled).
_MENTION_OR_URL = re.compile(r'@[A-Za-z0-9]+|https?://[A-Za-z0-9./]+')


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading the corpus on first use."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(nltk.corpus.stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def clean_text(text):
    """Normalize a free-text description into lowercase, space-separated words.

    Steps: drop @mentions and http(s) URLs, replace every character that is
    not an ASCII letter with a space, lowercase, split on whitespace, and
    remove NLTK English stop words.

    Args:
        text: the raw description as ``str``. ``bytes`` are also accepted and
            decoded as UTF-8 (a leading BOM is dropped, undecodable bytes are
            replaced).

    Returns:
        str: the remaining words joined by single spaces ('' if none remain).
    """
    # The previous version called ``.decode`` on the already-stripped text
    # inside a bare ``except``; on Python 3 ``str`` that call always failed and
    # was silently swallowed. Decode explicitly, and only when given bytes.
    if isinstance(text, bytes):
        text = text.decode('utf-8-sig', errors='replace')

    without_handles_and_links = _MENTION_OR_URL.sub('', text)
    letters_only = re.sub("[^a-zA-Z]", " ", without_handles_and_links).lower()

    # Only ASCII letters and spaces remain at this point, so a plain
    # whitespace split yields the same tokens as WordPunctTokenizer and also
    # collapses the extra spaces introduced by the substitution above.
    stop_words = _english_stop_words()
    kept = [word for word in letters_only.split() if word not in stop_words]

    return " ".join(kept)


def clean_descriptions(df):
    """Drop rows without a description and add a 'cleaned_description' column.

    The input frame is left untouched; a new frame is returned. (The previous
    version dropped rows and added the column on the caller's frame in place,
    which made re-running cells order-dependent and could raise
    SettingWithCopyWarning when given a slice.)

    Args:
        df: DataFrame with a 'user_description' column.

    Returns:
        pandas.DataFrame: rows of ``df`` whose 'user_description' is not
        null, with 'cleaned_description' holding ``clean_text`` applied to
        each description. The original index labels are kept.
    """
    # With a single-column subset, how='all' and the default how='any' are equivalent.
    described = df.dropna(subset=['user_description'])
    return described.assign(
        cleaned_description=described['user_description'].apply(clean_text)
    )


def find_n_most_common_words(df, col_name, n):
    """Return the ``n`` most frequent non-stop-word tokens in a text column.

    All values of ``df[col_name]`` are lowercased (so 'Maga' and 'MAGA' count
    as the same word), joined into one string, tokenized with
    ``nltk.tokenize.word_tokenize``, and counted after removing NLTK English
    stop words plus a few punctuation/URL fragments.

    Args:
        df: DataFrame containing the text column.
        col_name: name of the text column to analyse.
        n: number of top tokens to return.

    Returns:
        pandas.DataFrame with columns 'Word' and 'Frequency', most frequent first.
    """
    # The tokenizer emits these as standalone tokens; without removing them
    # the hashtag sign would be the most common "word".
    extra_stop_tokens = [',', '’', '#', '.', '!', '@', '&', ':', '|', '(', ')',
                         "'s", ';', '-', "n't", '%', '...', "'m", 'http', 'https', 'de']

    # A set gives constant-time membership tests over the (large) token stream.
    excluded = set(nltk.corpus.stopwords.words('english'))
    excluded.update(extra_stop_tokens)

    corpus_text = df[col_name].str.lower().str.cat(sep=' ')
    tokens = nltk.tokenize.word_tokenize(corpus_text)

    frequencies = nltk.FreqDist(token for token in tokens if token not in excluded)

    return pd.DataFrame(frequencies.most_common(n), columns=['Word', 'Frequency'])


def calculate_political_word_counts(df, political_words=None):
    """Count how many distinct political words appear in each cleaned description.

    Adds (or overwrites) the 'political_word_count' column on ``df`` and
    returns the same frame.

    Args:
        df: DataFrame with a 'cleaned_description' column of
            whitespace-separated words.
        political_words: optional iterable of words to look for. Defaults to
            ``get_master_political_words()``.

    Returns:
        pandas.DataFrame: ``df`` itself, with the new column.
    """
    if political_words is None:
        political_words = get_master_political_words()

    # Deduplicate once. The previous arithmetic, len(list) - len(set
    # difference), over-counted every row by the number of duplicate entries
    # whenever the vocabulary contained repeats.
    vocabulary = frozenset(political_words)

    # NOTE(review): if 'cleaned_description' holds letters-only text, entries
    # containing digits, punctuation or emoji (e.g. '2a', 'pro-life') can
    # never match — confirm whether that is intended.
    df['political_word_count'] = [
        len(vocabulary.intersection(description.split()))
        for description in df['cleaned_description']
    ]
    return df
    
    
def show_statistics(col_name, text_for_print, df):
    """Print the average, standard deviation, range and zero count of a column.

    Args:
        col_name: name of the numeric column in ``df``.
        text_for_print: human-readable label used in the printed lines.
        df: DataFrame holding the column.

    Returns:
        None. Four lines are written to stdout.
    """
    values = df[col_name]
    row_count = len(df)

    # Average over all rows. skipna=False means a NaN in the column still
    # yields a NaN average. An empty frame reports 'nan' instead of raising
    # ZeroDivisionError.
    if row_count:
        average = float(values.sum(skipna=False)) / row_count
    else:
        average = float('nan')
    print(col_name + ' average: ' + str(round(average, 2)))

    # Standard deviation (pandas default for Series.std)
    spread = values.std()
    print(text_for_print + ' standard deviation ' + str(spread))

    # Range; local names chosen so the builtins max/min are not shadowed
    highest = values.max()
    lowest = values.min()
    print(text_for_print, " ranges from ", str(lowest), " - ", str(highest))

    # Number of rows whose value is exactly zero
    zero_rows = int((values == 0).sum())
    print(text_for_print, " has ", str(zero_rows), ' rows with zero as their value')
    

def get_political_users(df, min_val, equal=False):
    """Select rows by their 'political_word_count'.

    Args:
        df: DataFrame with a 'political_word_count' column.
        min_val: threshold to compare against.
        equal: when True keep rows with count >= min_val, otherwise only
            rows with count > min_val.

    Returns:
        pandas.DataFrame: the matching rows of ``df``.
    """
    counts = df.political_word_count
    keep = counts >= min_val if equal else counts > min_val
    return df[keep]


def get_top_5_percent_cap_scores(df):
    """Return the largest 5% of the 'cap' values as a Series.

    The number of values kept is ``int(0.05 * len(df))`` (rounded down); it
    is also printed.
    """
    n = int(len(df) * 0.05)
    print('n: ', n)
    return df['cap'].nlargest(n)


def get_top_5_bot_scores(df):
    """Return the largest 5% of the 'bot_score' values as a Series.

    The number of values kept is ``int(0.05 * len(df))`` (rounded down); it
    is also printed.
    """
    n = int(len(df) * 0.05)
    print('n: ', n)
    return df['bot_score'].nlargest(n)


def get_min_bot_cap_score(df):
    """Return the smallest 'cap' value that is still within the top 5%.

    Prints and returns the last element of
    ``get_top_5_percent_cap_scores(df)``. Raises IndexError when that
    selection is empty (fewer than 20 rows).
    """
    top_scores = list(get_top_5_percent_cap_scores(df))
    lowest = top_scores[-1]
    print('lowest: ', lowest)
    return lowest


def get_min_bot_score(df):
    """Return the smallest 'bot_score' value that is still within the top 5%.

    Prints and returns the last element of ``get_top_5_bot_scores(df)``.
    Raises IndexError when that selection is empty (fewer than 20 rows).
    """
    top_scores = list(get_top_5_bot_scores(df))
    lowest = top_scores[-1]
    print('lowest: ', lowest)
    return lowest


def get_top_5_cap_scores_df(df):
    """Return the rows of ``df`` whose 'cap' is strictly above the top-5% cutoff.

    The cutoff is the value returned by ``get_min_bot_cap_score(df)``.
    """
    # NOTE(review): the comparison is strict, so rows equal to the cutoff are
    # excluded, whereas get_top_5_bot_scores_df uses >= — confirm which is intended.
    cutoff = get_min_bot_cap_score(df)
    above_cutoff = df.cap > cutoff
    return df[above_cutoff]
    
    
def get_top_5_bot_scores_df(df):
    """Return the rows of ``df`` whose 'bot_score' is within the top 5%.

    The cutoff is the smallest 'bot_score' among the top 5% of rows, as
    returned by ``get_min_bot_score(df)``; rows equal to the cutoff are kept.

    Args:
        df: DataFrame with a 'bot_score' column.

    Returns:
        pandas.DataFrame: rows with ``bot_score >= cutoff``.
    """
    # Fix: the cutoff was previously taken from the 'cap' column
    # (get_min_bot_cap_score) and then compared against 'bot_score'.
    cutoff = get_min_bot_score(df)
    return df[df.bot_score >= cutoff]


def print_to_text_file(file_name, data, with_new_line=True):
    """Write each item of ``data`` on its own line of '<file_name>.txt'.

    Args:
        file_name: output path without the '.txt' extension.
        data: iterable of items; each is rendered with ``str.format``.
        with_new_line: when True write the bare item per line; when False
            write each item wrapped in single quotes followed by a comma
            (handy for pasting into a Python list literal).

    Returns:
        None. An existing file is overwritten.
    """
    file_name += ".txt"
    line_template = "{}\n" if with_new_line else "'{}',\n"

    # Explicit UTF-8 so non-ASCII tokens (emoji, curly quotes) are written
    # regardless of the platform's default encoding. The with-block closes
    # the file, so no manual close() is needed.
    with open(file_name, "w", encoding="utf-8") as text_file:
        text_file.writelines(line_template.format(item) for item in data)

    print('Finished creating ', file_name)
    

def save_to_pickle(file_name, data):
    """Pickle ``data`` into '<file_name>.pkl', overwriting any existing file."""
    target = file_name + '.pkl'
    with open(target, 'wb') as handle:
        pickle.dump(data, handle)
        

def open_from_pickle(file_name):
    """Load and return the object stored in '<file_name>.pkl'."""
    # NOTE: unpickling can execute arbitrary code; only load files this
    # project produced itself.
    source = file_name + '.pkl'
    with open(source, 'rb') as handle:
        return pickle.load(handle)

def save_df_to_csv(df, file_name):
    """Write ``df`` to '<file_name>.csv' as UTF-8, without the index column."""
    destination = file_name + '.csv'
    df.to_csv(destination, index=False, encoding='utf-8')

In [4]:
# Load the master account table from <cwd>/Data/Master-Data/.
# The earlier input file is kept below for reference.
# df = load_master_id_data('MasterIDs.csv') 
df = load_master_id_data('MasterIDs-2.csv')
df.shape # total number of accounts scraped

(241300, 28)

In [6]:
# Average, standard deviation, range and zero count of 'cap' over every loaded account.
show_statistics('cap', 'cap score', df)

cap average: 0.06
cap score standard deviation 0.1322926375380892
cap score  ranges from  0.001556878998692371  -  0.9670261466915276
cap score  has  0  rows with zero as their value


In [7]:
# Drop accounts without a description and add the normalized 'cleaned_description' column.
# `df` is rebound here, so from this cell on it no longer contains description-less accounts.
df = clean_descriptions(df)

In [8]:
# most_common_description_words = find_n_most_common_words(df, 'user_description', 500)
# word_list = most_common_description_words['Word'].tolist()
# print_to_text_file("most_common_words_as_list", word_list, with_new_line=False)

In [9]:
# Add 'political_word_count': how many master political words appear in each cleaned description.
df = calculate_political_word_counts(df)

In [10]:
# Average, standard deviation, range and zero count of 'political_word_count'.
show_statistics('political_word_count', 'political word count', df)

political_word_count average: 1.42
political word count standard deviation 1.7943026474578256
political word count  ranges from  0  -  13
political word count  has  84722  rows with zero as their value


In [11]:
# Keep accounts with at least one master political word (political_word_count >= 1).
# political_df = get_political_users(df)
political_df = get_political_users(df, 1, equal=True)

In [12]:
# (rows, columns) of the political subset.
political_df.shape # First number is the total number of accounts

(106825, 30)

In [13]:
# Same 'cap' summary as for the full table, restricted to the political subset.
show_statistics('cap', 'cap score', political_df)

cap average: 0.05
cap score standard deviation 0.09838781503735348
cap score  ranges from  0.001556878998692371  -  0.9670261466915276
cap score  has  0  rows with zero as their value


In [14]:
# Split the political accounts into groups by their 'cap' value.
# NOTE(review): 0.53, 0.8 and 0.3 are hard-coded thresholds with no rationale
# given in this notebook — confirm their source and consider naming them as constants.
political_bots = political_df[political_df.cap > 0.53]
def_political_bots = political_df[political_df.cap >= 0.8]
political_humans = political_df[political_df.cap < 0.3]
# Accounts with 0.3 <= cap <= 0.53 fall into neither political_bots nor political_humans.

# Group sizes, in order: bots, definite bots, humans.
print(len(political_bots))
print(len(def_political_bots))
print(len(political_humans))

1222
134
103437


In [30]:
# Writes 'political_human_profiles(1-word).csv' relative to the current working directory.
# NOTE(review): this cell's execution count (In [30]) is out of sequence — re-run the notebook top to bottom.
save_df_to_csv(political_humans, 'political_human_profiles(1-word)')

In [14]:
# Persist the user ids of accounts with cap > 0.53 to 'political_bot_ids(1-word).pkl'.
political_bot_ids = political_bots.user_id.tolist()
save_to_pickle('political_bot_ids(1-word)', political_bot_ids)

In [24]:
# Writes 'political_bot_profiles(1-word).csv' relative to the current working directory.
save_df_to_csv(political_bots, 'political_bot_profiles(1-word)')

In [16]:
# 'cap' summary for the accounts with cap > 0.53.
show_statistics('cap', 'cap score', political_bots)

cap average: 0.66
cap score standard deviation 0.10604601853488954
cap score  ranges from  0.5319634089641295  -  0.9670261466915276
cap score  has  0  rows with zero as their value


In [17]:
# 'cap' summary for the accounts with cap >= 0.8.
show_statistics('cap', 'cap score', def_political_bots)

cap average: 0.87
cap score standard deviation 0.045168078210361264
cap score  ranges from  0.8055519629117155  -  0.9670261466915276
cap score  has  0  rows with zero as their value


In [94]:
# def_bot_usernames = def_political_bots.user_screen_name.tolist()
# print_to_text_file("def_bot_usernames", def_bot_usernames)

In [21]:
def show_most_common_political_words(df, top_n=50):
    """Return the most frequent words across all cleaned descriptions.

    Despite the name, no political-word filter is applied: every token in
    'cleaned_description' is counted.

    Args:
        df: DataFrame with a 'cleaned_description' text column.
        top_n: number of words to return (default 50, the previously
            hard-coded value).

    Returns:
        pandas.DataFrame with columns 'Word' and 'Frequency', most frequent first.
    """
    # Lowercase before counting so words of different cases (e.g. Maga and
    # MAGA) are counted as the same word.
    corpus_text = df['cleaned_description'].str.lower().str.cat(sep=' ')
    tokens = nltk.tokenize.word_tokenize(corpus_text)
    frequencies = nltk.FreqDist(tokens)

    return pd.DataFrame(frequencies.most_common(top_n), columns=['Word', 'Frequency'])

In [22]:
# Most frequent words in the cleaned descriptions of the accounts with cap > 0.53.
most_common = show_most_common_political_words(political_bots)
print(most_common)

             Word  Frequency
0            maga        426
1           trump        343
2    conservative        141
3            news        134
4             kag        118
5            love        111
6             god        108
7             nra        105
8       christian        102
9         america         97
10          proud         92
11        patriot         90
12       military         82
13       american         74
14        country         72
15      president         67
16         family         66
17      supporter         63
18         follow         60
19       politics         57
20            pro         55
21  draintheswamp         55
22   americafirst         54
23        support         54
24     trumptrain         53
25           life         53
26   buildthewall         51
27        prolife         51
28             fb         48
29        married         46
30            vet         44
31        veteran         42
32             us         41
33            