In [156]:
%%writefile IRA_tweets.py
import pandas as pd
import os

class IRA_tweets:
    """Load the IRA tweet CSV and derive weekly per-account tweet counts.

    The tweet table is kept in ``self.data`` and is modified by
    ``english_only_tweets`` and ``round_dates_to_weekstarting``; the remaining
    methods read from it.

    Parameters
    ----------
    ira_csv : str
        Path to the IRA tweet CSV. Only the columns listed in
        ``self.intersect`` and ``self.ira_cols`` are read.
    rtweet_sample_csv : str
        Path to a CSV whose header row supplies the full set of rtweet column
        names (only the first row is read).
    nrows : int or None
        If given, read only the first ``nrows`` tweets (useful for quick
        experiments); ``None`` reads the whole file.
    """

    def __init__(self, ira_csv='ira_tweets_csv_hashed.csv',
                 rtweet_sample_csv='samplehl.csv', nrows=None):
        # Columns read as-is; save_data_for_bot_test does not rename them
        # (presumably rtweet uses the same names -- TODO confirm).
        self.intersect = ['hashtags', 'is_retweet', 'quote_count', 'reply_count', 'retweet_count']

        # Column names in the IRA data and, position by position, the matching
        # column names in rtweet data. The two lists must stay aligned.
        self.ira_cols = ['account_creation_date', 'account_language', 'follower_count',
                         'in_reply_to_tweetid', 'in_reply_to_userid',
                         'quoted_tweet_tweetid', 'retweet_tweetid', 'retweet_userid',
                         'tweet_language', 'tweet_text', 'tweet_time',
                         'tweetid', 'urls', 'user_display_name', 'user_mentions',
                         'user_profile_description', 'user_profile_url',
                         'user_reported_location', 'user_screen_name', 'userid']

        self.rtweet_cols = ['account_created_at', 'account_lang', 'followers_count',
                            'reply_to_status_id', 'reply_to_user_id',
                            'quoted_status_id', 'retweet_status_id', 'retweet_user_id',
                            'lang', 'text', 'created_at', 'status_id',
                            'urls_expanded_url', 'name', 'mentions_user_id', 'description',
                            'profile_expanded_url', 'location',
                            'screen_name', 'user_id']

        # Full list of rtweet columns, taken from the header of a sample file.
        self.rtweet_cols_fullset = pd.read_csv(rtweet_sample_csv, nrows=1).columns

        self.data = pd.read_csv(ira_csv,
                                usecols=self.intersect + self.ira_cols,
                                nrows=nrows)

        # Parse timestamps so the .dt accessor can be used later.
        self.data['tweet_time'] = pd.to_datetime(self.data['tweet_time'])

    def english_only_tweets(self):
        """Keep tweets whose account language and tweet language are both 'en'.

        The two language columns are dropped afterwards, so this method can
        only be applied once per loaded dataset.
        """
        is_english = ((self.data['account_language'] == 'en')
                      & (self.data['tweet_language'] == 'en'))
        # `columns=` instead of a positional axis: pandas 2.0 made `axis`
        # keyword-only for DataFrame.drop.
        self.data = self.data[is_english].drop(columns=['account_language', 'tweet_language'])

    def round_dates_to_weekstarting(self):
        """Replace each ``tweet_time`` by midnight of the Monday of its week."""
        tweet_time = self.data['tweet_time']
        # dayofweek is 0 for Monday, so subtracting that many days lands on
        # the Monday of the same week.
        week_start = tweet_time - pd.to_timedelta(tweet_time.dt.dayofweek, unit='D')
        # normalize() zeroes the time of day without a round-trip through
        # Python date objects.
        self.data['tweet_time'] = week_start.dt.normalize()

    def get_number_of_weeks_account_posted(self):
        """Count distinct ``tweet_time`` values per account.

        After ``round_dates_to_weekstarting`` this is the number of weeks in
        which the account tweeted.

        Returns
        -------
        pandas.DataFrame
            Columns ``user_screen_name`` and ``tweet_time`` (the count),
            sorted by count in descending order. Also stored in
            ``self.number_of_weeks_account_posted``.
        """
        self.number_of_weeks_account_posted = (
            self.data
            .groupby(['user_screen_name'], as_index=False)
            .agg({'tweet_time': 'nunique'})
            .sort_values('tweet_time', ascending=False))

        return self.number_of_weeks_account_posted

    def tweet_counts_by_account_and_week(self, top_IRA_accounts):
        """Count tweets per (tweet_time, account) for the given accounts.

        Parameters
        ----------
        top_IRA_accounts : list-like of str
            Screen names to include.

        Returns
        -------
        pandas.DataFrame
            Long-format table with columns ``tweet_time``,
            ``user_screen_name`` and ``0`` (the tweet count), sorted by
            account and then time. Also stored in ``self.tweet_counts``.
        """
        selected = self.data[self.data['user_screen_name'].isin(top_IRA_accounts)]
        # size() on a regular groupby is always a Series; naming the count
        # column 0 explicitly keeps the layout that create_wide_dataframe
        # expects regardless of the pandas version (as_index=False + size()
        # changed its return type in pandas 1.1).
        self.tweet_counts = (
            selected
            .groupby(['tweet_time', 'user_screen_name'])
            .size()
            .reset_index(name=0)
            .sort_values(['user_screen_name', 'tweet_time']))

        return self.tweet_counts

    def save_data_for_bot_test(self, ta, directory='bot_detection/'):
        """Write one account's tweets to ``directory`` using rtweet column names.

        Parameters
        ----------
        ta : str
            Screen name of the account; also used as the output file name.
        directory : str
            Output directory, created if it does not exist.
        """
        ira_to_rtweet = dict(zip(self.ira_cols, self.rtweet_cols))
        account = (
            self.data[self.data['user_screen_name'] == ta]
            # rename() builds new labels; editing `columns.values` in place
            # would also rename self.data, which shares the same Index.
            .rename(columns=ira_to_rtweet)
            # reindex() adds rtweet columns that have no IRA counterpart as
            # empty columns; `.loc` raises KeyError for missing labels.
            .reindex(columns=self.rtweet_cols_fullset))

        os.makedirs(directory, exist_ok=True)
        account.to_csv(os.path.join(directory, ta))

    def get_top_accounts_with_botornot_score(self, fn, top_IRA_accounts):
        """Return the accounts that have at least one non-null value in ``fn``.

        Parameters
        ----------
        fn : str
            Path to a CSV whose first column holds the account names.
        top_IRA_accounts : list-like of str
            Candidate screen names.

        Returns
        -------
        pandas.Index
            The candidates, in their given order, that appear in the file
            with at least one non-null value. Candidates absent from the file
            are skipped rather than raising KeyError.
        """
        botornot = pd.read_csv(fn, index_col=0)
        present = [name for name in top_IRA_accounts if name in botornot.index]
        botornot = botornot.loc[present]

        return botornot[~botornot.isnull().all(axis=1)].index

    def create_wide_dataframe(self, tweet_counts):
        """Pivot long-format weekly counts into an account-by-week table.

        Parameters
        ----------
        tweet_counts : pandas.DataFrame
            Output of ``tweet_counts_by_account_and_week`` (columns
            ``tweet_time``, ``user_screen_name`` and ``0``). ``tweet_time``
            values are expected to be spaced in multiples of 7 days.

        Returns
        -------
        pandas.DataFrame
            One row per account and one column per week from the first to the
            last observed week; weeks without tweets are filled with 0.
        """
        wide = tweet_counts.pivot(index='tweet_time', columns='user_screen_name', values=0)

        # Every week between the first and last observation, 7 days apart.
        weeks = pd.date_range(wide.index.min(), wide.index.max(), freq='7D')

        # Add the missing weeks and replace missing values with zeros.
        return wide.reindex(weeks).fillna(0).T

Overwriting IRA_tweets.py


In [157]:
%%writefile run_IRA_tweets.py
import IRA_tweets
import pandas as pd
import matplotlib.pyplot as plt
import os

# Thresholds used to pick the accounts to analyse.
MIN_WEEKS_POSTED = 100        # keep accounts active in MORE than this many weeks
MAX_SCREEN_NAME_LENGTH = 20   # keep screen names strictly shorter than this

ira = IRA_tweets.IRA_tweets()
# NOTE(review): `self` is kept as a module-level alias in case other cells or
# scripts still refer to it after running this file -- remove once confirmed unused.
self = ira
ira.english_only_tweets()
ira.round_dates_to_weekstarting()

number_of_weeks_posted = ira.get_number_of_weeks_account_posted()

# accounts that tweeted in more than MIN_WEEKS_POSTED distinct weeks
top_IRA_accounts = (number_of_weeks_posted[number_of_weeks_posted['tweet_time'] > MIN_WEEKS_POSTED]
                    ['user_screen_name'].values)

# keep non-anonymized accounts only (identified here by their short screen names)
top_IRA_accounts = [w for w in top_IRA_accounts if len(w) < MAX_SCREEN_NAME_LENGTH]

# Optional step: save the log of each high-volume account in rtweet format.
#for ta in top_IRA_accounts:
#    ira.save_data_for_bot_test(ta)

fn = 'botornot.csv' # path to botornot probabilities
top_IRA_accounts = ira.get_top_accounts_with_botornot_score(fn, top_IRA_accounts)

tweet_counts = ira.tweet_counts_by_account_and_week(top_IRA_accounts)
df_ts = ira.create_wide_dataframe(tweet_counts)

df_ts.to_csv('IRA_timeseries')


### plot and save rough pdfs of time series
plot_directory = 'timeseries_plots/'
# Created once, up front; exist_ok avoids the check-then-create race.
os.makedirs(plot_directory, exist_ok=True)

for name in top_IRA_accounts:
    # weekly tweet counts of this account, indexed by week start (count column is 0)
    weekly_counts = (tweet_counts[tweet_counts['user_screen_name'] == name]
                     .set_index('tweet_time')[0])

    fig, ax = plt.subplots()

    ax.plot(weekly_counts, 'o-')
    ax.set_title(name)
    ax.set_xlabel('week starting')
    ax.set_ylabel('tweets per week')

    # Save through the figure object rather than pyplot's "current figure".
    fig.savefig(plot_directory + name + '.pdf')

    # Close explicitly so figures do not accumulate in memory across the loop.
    plt.close(fig)

Overwriting run_IRA_tweets.py


In [143]:
# Display the currently selected account names.
# NOTE(review): the output below is a plain list of 28 names, whereas
# run_IRA_tweets.py ends with a pandas Index filtered by botornot scores, and
# this cell's execution count (143) is lower than the cells above (156/157).
# The output presumably predates the current code -- re-run after
# Restart Kernel -> Run All.
top_IRA_accounts

['Jenn_Abrams',
 'DanaGeezus',
 'KansasDailyNews',
 'ChrixMorgan',
 'GiselleEvns',
 'DailySanJose',
 'DailySanFran',
 'DailySanDiego',
 'PhoenixDailyNew',
 'Seattle_Post',
 'SanAntoTopNews',
 'LoraGreeen',
 'NewOrleansON',
 'AmandaVGreen',
 'IlikeBIGbuttand',
 'NotRitaHart',
 'WashingtOnline',
 'ChicagoDailyNew',
 'OnlineCleveland',
 'HoustonTopNews',
 'TodayPittsburgh',
 'todayinsyria',
 'gloed_up',
 'TodayNYCity',
 'DetroitDailyNew',
 'OaklandOnline',
 'StLouisOnline',
 'BleepThePolice']

In [147]:
# List the distinct accounts present in the weekly tweet-count table.
# NOTE(review): only 5 accounts appear below although 28 were listed above;
# this output (execution count 147) presumably comes from an earlier run,
# possibly on a subset of the data -- confirm by re-running on a fresh kernel.
tweet_counts['user_screen_name'].unique()

array(['DailySanFran', 'Jenn_Abrams', 'KansasDailyNews', 'SanAntoTopNews',
       'Seattle_Post'], dtype=object)