In [4]:
import json
import gzip
from pymongo import MongoClient
import pandas as pd
import time
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# Notebook-wide MongoDB client. It is constructed with no arguments, so whatever
# connection defaults pymongo applies are used (presumably a local server — TODO confirm).
mongo_client = MongoClient()
# Verbosity switch. Nothing in the visible cells reads it; presumably it is consumed
# by the helpers loaded from supporting_functions.ipynb — TODO confirm.
verbose_mode_on = True

In [5]:
%run supporting_functions.ipynb

In [14]:
def get_preferred_user_for_user(data_to_use):
    """Collect, for every user, the distinct authors that user retweeted.

    Parameters
    ----------
    data_to_use : pandas.DataFrame
        Tweet-level frame. Must contain the columns ``user_id``,
        ``retweet_flag`` and ``retweeted_status_user_id_str_h``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``user_id`` (in order of first appearance) with
        the columns ``user_id`` and ``preferred_authors_list``. The list holds
        the distinct retweeted author ids in order of first appearance and is
        empty for users that have no retweets in ``data_to_use``.
    """
    users_distinct = data_to_use['user_id'].drop_duplicates()

    # Build the mask from the same frame it is applied to. Masking a per-user
    # slice with a Series taken from the full frame forces pandas to re-align
    # the mask, which warns and fails outright when index labels repeat.
    retweets = data_to_use[data_to_use['retweet_flag'] == True]

    # A single grouped pass replaces one full-frame scan per user.
    authors_by_user = {
        user: authors.unique().tolist()
        for user, authors in retweets.groupby('user_id', sort=False)['retweeted_status_user_id_str_h']
    }

    rows_of_data = [[user, authors_by_user.get(user, [])] for user in users_distinct]
    return pd.DataFrame(rows_of_data, columns=['user_id', 'preferred_authors_list'])

def get_tweet_data_for_a_period(data, start_timestamp_ms, end_timestamp_ms):
    """Return the rows of ``data`` whose ``timestamp_ms`` lies in the closed
    interval ``[start_timestamp_ms, end_timestamp_ms]`` (both ends included)."""
    stamps = data['timestamp_ms']
    not_too_early = stamps.ge(start_timestamp_ms)
    not_too_late = stamps.le(end_timestamp_ms)
    return data.loc[not_too_early & not_too_late]

def get_preferred_user_retweet_stats(twitter_data, start_time, end_time):
    """Placeholder: not implemented yet. Ignores its arguments and returns None."""
    return None

def find_retweet_freq_preferred_user(tweet_dataframe, train_start_ts_ms, window_num_days):
    """Measure how often test-window users retweet their train-window authors.

    The train window is ``[train_start_ts_ms, train_start_ts_ms + window]`` and
    the test window is the next ``window_num_days`` days immediately after it.
    A user's "preferred authors" are the distinct authors that user retweeted
    during the train window.

    Parameters
    ----------
    tweet_dataframe : pandas.DataFrame
        Tweet-level frame with the columns ``user_id``, ``timestamp_ms``
        (integer epoch milliseconds), ``retweet_flag`` and
        ``retweeted_status_user_id_str_h`` (0 marks "not a retweet").
    train_start_ts_ms : int
        Start of the train window, in epoch milliseconds.
    window_num_days : int
        Length in days of the train window and of the test window.

    Returns
    -------
    pandas.DataFrame
        One row per user that tweeted in the test window (order of first
        appearance; rows without a ``user_id`` are skipped), with the columns:

        * ``user_id``
        * ``pref_auth_RT_freq`` - number of test-window retweets whose author
          is one of the user's preferred authors
        * ``tot_RT_freq`` - total number of retweets in the test window
        * ``tot_Tweet`` - total number of tweets in the test window
    """
    window_ms = window_num_days * 86400000  # milliseconds per day
    train_end_ts_ms = train_start_ts_ms + window_ms
    # get_tweet_data_for_a_period includes both end points, so the test window
    # starts one millisecond after the train window ends; otherwise a tweet
    # stamped exactly at train_end_ts_ms would be counted in both windows.
    test_start_ts = train_end_ts_ms + 1
    test_end_ts = train_end_ts_ms + window_ms

    train_data = get_tweet_data_for_a_period(tweet_dataframe, train_start_ts_ms, train_end_ts_ms)
    test_data = get_tweet_data_for_a_period(tweet_dataframe, test_start_ts, test_end_ts)

    train_preferred_users_data = get_preferred_user_for_user(train_data)
    # user_id -> set of preferred authors, for constant-time lookup per test user.
    preferred_by_user = {
        user: set(authors)
        for user, authors in zip(
            train_preferred_users_data['user_id'],
            train_preferred_users_data['preferred_authors_list'],
        )
    }

    data_list = []
    for user, user_tweets in test_data.groupby('user_id', sort=False):
        tot_Tweet = user_tweets.shape[0]
        user_retweets = user_tweets[user_tweets['retweet_flag'] == True]
        tot_RT = user_retweets.shape[0]

        retweeted_authors = user_retweets['retweeted_status_user_id_str_h']
        # Users unseen in the train window have no preferred authors.
        preferred = preferred_by_user.get(user, set())
        # Count retweets (not distinct authors); ignore the 0 placeholder id.
        hits = retweeted_authors.ne(0) & retweeted_authors.isin(preferred)
        pref_RT = int(hits.sum())

        # Every test user gets exactly one row, including those with no match.
        data_list.append([user, pref_RT, tot_RT, tot_Tweet])

    test_RT_freq = pd.DataFrame(
        data_list,
        columns=['user_id', 'pref_auth_RT_freq', 'tot_RT_freq', 'tot_Tweet'],
    )
    return test_RT_freq
        

In [6]:
# ## code to run once ##
database = "Nov19-test"  # sys.argv[2]
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
filename = "/Users/divyachoudhary/Documents/Work-ISI/RetweetNetwork/Tng_an_CVE_Twitter-mention.json.gz"
# Always start from a fresh database: drop a stale copy first, then create it.
if db_exists(database):
    delete_db(database)
create_db(database)
# The archive holds one JSON document per line; skip blank lines so a trailing
# newline cannot crash json.loads.
data = []
with gzip.GzipFile(filename, "r") as fp:
    for line in fp:
        if line.strip():
            data.append(json.loads(line))
db = mongo_client[database]["Twitter_CVE"]
db.insert_many(data)
# ## code to run once ##
# using the data and creating pandas data frame
client = MongoClient()
# From here on `database` is the pymongo database object, not the name string.
database = client['Nov19-test']
collection = database.list_collection_names()
cve = database.Twitter_CVE
start = time.time()
mongo_json_twitter_cve = list(cve.find())
# pandas exposes json_normalize at top level from 1.0 on; the pd.io.json path
# is deprecated, so use it only as a fallback on older installations.
_json_normalize = getattr(pd, "json_normalize", None) or pd.io.json.json_normalize
normalized = _json_normalize(mongo_json_twitter_cve)
end = time.time()
print("time taken", end - start)
twitter_cve_data = pd.DataFrame(normalized)

Database Nov19-test was dropped.
New database created, test collection added. 5e1ce1a4c471a20df7169e2c
time taken 377.6765100955963


In [10]:
# Get selected columns of interest
list_of_rows = []
error_indices = []  # positions of rows whose feature extraction raised ValueError
for i in range(len(twitter_cve_data)):
    try:
        # `i` is a position, so index positionally rather than by label.
        list_of_rows.append(list(get_tweet_features_per_row(twitter_cve_data.iloc[i])))
    except ValueError:
        error_indices.append(i)
        print("there is value error at i = ", i)
df = pd.DataFrame(
    list_of_rows,
    columns=[
        'user_id', 'user_screen_name', 'user_tz', 'user_statuses_count',
        'user_location', 'user_listed_count', 'user_lang', 'user_friends_count',
        'user_followers_count', 'user_following', 'user_favourites_count',
        'user_description_m', 'user_created_at', 'id_str_h', 'lang', 'place',
        'content_language', 'created_at', 'hashtags', 'tweet_has_media',
        'symbols', 'tweet_has_symbol', 'user_mentions', 'tweet_has_user_mentions',
        'extension_created_dow', 'extension_created_hod',
        'retweeted_status_user_id_str_h', 'quoted_status_id_str_h',
        'quoted_status_user_id_str_h', 'text_m', 'timestamp_ms',
        'retweeted_status_text_m',
    ],
)
# Retweet related: a missing retweeted-author id becomes 0, and the flag is
# simply "id is not 0" (vectorized instead of a row-wise apply).
df['retweeted_status_user_id_str_h'] = df['retweeted_status_user_id_str_h'].fillna(value = 0)
df['retweet_flag'] = df['retweeted_status_user_id_str_h'] != 0
# Quoted Tweet related: same convention as for retweets.
df['quoted_status_id_str_h'] = df['quoted_status_id_str_h'].fillna(value = 0)
df['quoted_status_user_id_str_h'] = df['quoted_status_user_id_str_h'].fillna(value = 0)
df['quoted_status_flag'] = df['quoted_status_id_str_h'] != 0
print("final data frame created for the given collection with shape: {}".format(df.shape))
# Millisecond epoch values exceed the 32-bit range, so request int64 explicitly
# instead of the platform-dependent 'int'.
df = df.astype({'timestamp_ms': 'int64'})
df = df.sort_values(by=['timestamp_ms'])

there is value error at i =  87412
final data frame created for the given collection with shape: (259059, 34)


#### For each test user (`user_id`) we compute:
1. the total number of tweets made (`tot_Tweet`)
2. the total number of retweets made (`tot_RT_freq`)
3. the number of retweets of preferred authors (`pref_auth_RT_freq`); each test user's preferred authors are obtained from the train period

In [16]:
# Generate the per-user retweet statistics for the test window.
TRAIN_START_TS_MS = 1467334801000  # epoch milliseconds (2016-07-01 01:00:01 UTC)
WINDOW_NUM_DAYS = 30  # length of the train window and of the test window
test_RT_freq = find_retweet_freq_preferred_user(
    tweet_dataframe=df,
    train_start_ts_ms=TRAIN_START_TS_MS,
    window_num_days=WINDOW_NUM_DAYS,
)
test_RT_freq

  import sys


Unnamed: 0,user_id,pref_auth_RT_freq,tot_RT_freq,tot_Tweet
0,CBTTj9CvxZibceOJ4fzqgw,2,2,3
1,F_97ZY08XOdiQt17GAq5RQ,1,1,1
2,Z8mnX_0zxhpqPBu4nU8aug,1,1,2
3,Bk-RENd6922jWZaWcw--Bg,0,1,1
4,2BUkrCYkslCFjlMfjy4EBg,0,1,1
5,PUA5YuoNke74f_QsbrJ0Vg,0,0,4
6,L1Zx7oz3Orn80-KCHxkxsA,0,0,1
7,07e1OTOC7EOD186hPQRywQ,0,1,1
8,1IdxdFI_JGnuFg2jRUgVpA,0,0,3
9,rW6xxTD_Y9GoirgPLYrH4g,0,0,152
