# Extract 4 features from dataset
1. Visibility
2. favourites_count / statuses_count
3. followers_count / friends_count
4. Frequency of tweets published since the creation of the account

In [4]:
import calendar
import time

import numpy as np
import pandas as pd
from pymongo import MongoClient
# Ask NumPy to print floats in fixed-point form rather than scientific notation.
np.set_printoptions(suppress=True)
# Treat +inf and -inf as missing values ("NA") in pandas computations, so the
# later isnull()/dropna() calls also catch ratios that divided by zero.
# NOTE(review): this option is reported as deprecated in recent pandas
# releases -- confirm against the installed version.
pd.options.mode.use_inf_as_na = True

# Handles to the local MongoDB server, the `if29` database and the tweet collection.
client = MongoClient('localhost', 27017)
db = client['if29']
collection = db['Tweet Worldcup 200']

In [2]:
# One output document per user id: keep the profile counters and the account
# creation date taken from the last document of each user's group.
latest_profile_by_user = {
    "_id": "$user.id",
    "friends_count": {"$last": "$user.friends_count"},
    "listed_count": {"$last": "$user.listed_count"},
    "favourites_count": {"$last": "$user.favourites_count"},
    "statuses_count": {"$last": "$user.statuses_count"},
    "followers_count": {"$last": "$user.followers_count"},
    "created_time": {"$last": "$user.created_at"},
}
cursor = collection.aggregate([{"$group": latest_profile_by_user}])

# Flatten the aggregation result into a DataFrame with one row per user.
df = pd.json_normalize(cursor)

In [3]:
# Display the per-user profile frame (one row per `_id`).
df

Unnamed: 0,_id,friends_count,listed_count,favourites_count,statuses_count,followers_count,created_time
0,758966916593496064,49,0,75,446,28,Fri Jul 29 10:06:28 +0000 2016
1,1140595902,2566,0,7625,5672,2838,Fri Feb 01 20:03:57 +0000 2013
2,899575572732551168,533,0,15472,8576,176,Mon Aug 21 10:15:04 +0000 2017
3,353576074,1086,0,661,714,289,Fri Aug 12 08:59:38 +0000 2011
4,2598356918,4996,1,25639,8706,1071,Tue Jul 01 18:11:48 +0000 2014
...,...,...,...,...,...,...,...
267462,3951606867,559,0,39,668,39,Wed Oct 14 01:52:09 +0000 2015
267463,2376538666,190,0,128,23052,535,Sun Mar 02 17:21:26 +0000 2014
267464,14877969,297,4,1,6017,161,Fri May 23 05:26:32 +0000 2008
267465,2406883352,1070,14,266,27934,2304,Sun Mar 23 12:20:44 +0000 2014


## Feature construction (vis, r_fri_follow, avg_fav, frequency)

### Visibility
#### Group tweet texts by each user

In [5]:
# One output document per user id, carrying the list of that user's tweet texts.
tweets_by_user = {
    "$group": {
        "_id": "$user.id",
        "tweets": {"$push": "$text"},
    }
}
cursor1 = collection.aggregate([tweets_by_user])

# Flatten into a two-column frame: `_id` and `tweets` (a list of strings per row).
df1 = pd.json_normalize(cursor1)

In [6]:
# Display the per-user tweet lists (columns `_id` and `tweets`).
df1

Unnamed: 0,_id,tweets
0,14877969,[แวนในชุดนักเตะ รับ Worldcup2018 อย่างเท่เลยฮ่...
1,2494785421,[RT @Reuters: Female Saudi flag bearers to mis...
2,758966916593496064,[Jeez!!! Today feels different..kinda having ...
3,1140595902,[Its less than 1hr to the #WorldCup Opening ce...
4,899575572732551168,[RT @sergipinkman: Proclames unionistes per an...
...,...,...
267462,144278441,"[Hoe gaaf is dit!! Wij, medewerkers van #jumbo..."
267463,356977475,[RT @FIFAWorldCup: The #WorldCup starts today ...
267464,235208639,"[RT @RoundAndWhite: It's #WorldCup day, let th..."
267465,2406883352,[#MiCorazónGritaPor que hoy arrancó la fiesta ...


### Calculate visibility

In [7]:
def calcul_visibility(tweets, mention_weight=11.4, hashtag_weight=11.6, max_length=140):
    """Return the average visibility score of a user's tweets.

    Every "@" character adds ``mention_weight`` and every "#" character adds
    ``hashtag_weight``; the total over all tweets is divided by
    ``max_length * len(tweets)``.

    Parameters
    ----------
    tweets : sequence of str
        The tweet texts of a single user.
    mention_weight : float, default 11.4
        Score added per "@" (presumably the average length of a mention --
        confirm against the source of the formula).
    hashtag_weight : float, default 11.6
        Score added per "#" (presumably the average length of a hashtag --
        confirm against the source of the formula).
    max_length : int, default 140
        Per-tweet normaliser (presumably the tweet character limit -- confirm).

    Returns
    -------
    float
        The visibility score, or ``0.0`` when ``tweets`` is empty (the original
        implementation raised ``ZeroDivisionError`` in that case).
    """
    # No tweets means no mentions or hashtags; also avoids dividing by zero.
    if len(tweets) == 0:
        return 0.0

    s = 0
    for tweet in tweets:
        # Same accumulation order as before so the default-parameter results
        # are bit-for-bit unchanged.
        s += tweet.count("@") * mention_weight
        s += tweet.count("#") * hashtag_weight
    return s / (max_length * len(tweets))

In [8]:
# `df` and `df1` come from two separate $group aggregations whose row orders
# differ (compare the `_id` columns of the two previews), and the feature
# table later attaches this list to the `df`-derived columns by position.
# Score every user in `df1`, then emit the scores in the row order of `df`
# so that each visibility value lands on the right user.
visibility_by_user = df1.set_index("_id")["tweets"].map(calcul_visibility)

# A user present in `df` but absent from `df1` gets NaN here.
visibilities = visibility_by_user.reindex(df["_id"]).tolist()

### favourites_count / statuses_count

In [37]:
# Favourites per published status; a zero `statuses_count` yields inf or NaN.
avg_fav = df["favourites_count"] / df["statuses_count"]

### followers_count / friends_count

In [33]:
# Followers divided by friends (note the direction, despite the variable name);
# a zero `friends_count` yields inf or NaN.
r_fri_follow = df["followers_count"] / df["friends_count"]

In [32]:
# Write the followers/friends ratio, together with its index, to a CSV file
# in the current working directory.
r_fri_follow.to_csv("./r_fri_follow.csv")

### Frequency of tweets published since the creation of the account with t0=01/01/2019
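$I_1 = \frac{N_t}{t_0 - t}$, where $N_t$ is the account's `statuses_count`, $t$ is the account creation time and $t_0$ is the reference date.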

In [34]:
# Reference instant t0 = 2019-01-01 00:00:00 UTC as epoch seconds. Computed
# once here instead of being re-parsed on every call.
_T0_TIMESTAMP = calendar.timegm((2019, 1, 1, 0, 0, 0))


def ratio(n_tweets, date):
    """Return tweets per second between account creation and t0 (2019-01-01 UTC).

    Parameters
    ----------
    n_tweets : int or float
        Number of statuses published by the account.
    date : str
        Account creation time formatted like ``"Fri Jul 29 10:06:28 +0000 2016"``.
        The literal ``+0000`` offset is required; any other offset raises
        ``ValueError`` from ``time.strptime``.

    Returns
    -------
    float
        ``n_tweets / (t0 - created)`` with the denominator in seconds.

    Notes
    -----
    Both instants are converted with ``calendar.timegm``, which reads the
    parsed fields as UTC. The previous ``time.mktime`` read them as local time,
    so the result depended on the machine's time zone and DST rules.
    A creation time equal to t0 is not guarded and divides by zero.
    """
    created_timestamp = calendar.timegm(
        time.strptime(date, "%a %b %d %H:%M:%S +0000 %Y")
    )
    return n_tweets / (_T0_TIMESTAMP - created_timestamp)

In [35]:
# Publication rate for every row of `df`: ratio() of the status count and the
# account creation date, scaled by 100.
frequency = [
    ratio(status_total, signup_date) * 100
    for status_total, signup_date in zip(df["statuses_count"], df["created_time"])
]

### Generate a dataframe with 4 features

In [38]:
# Assemble the four feature columns in a fixed order. The plain lists
# (`visibilities`, `frequency`) are attached by position, while the Series
# (`r_fri_follow`, `avg_fav`) are aligned on the row index.
features = pd.DataFrame({"vis": visibilities}).assign(
    r_fri_follow=r_fri_follow,
    avg_fav=avg_fav,
    frequency=frequency,
)

In [41]:
# Count missing values per feature column.
# NOTE(review): the execution counts show this cell (In [41]) was last run
# after the dropna cell below (In [40]), so the saved all-zero output reflects
# the already-cleaned frame, not the frame as built above.
features.isnull().sum()

vis             0
r_fri_follow    0
avg_fav         0
frequency       0
dtype: int64

In [40]:
# The ratio features divide by count columns that can be zero, which yields
# +/-inf (x/0) or NaN (0/0). Turn the infinities into NaN explicitly instead of
# relying on the global `use_inf_as_na` option, then drop the incomplete rows.
# Rebinding rather than mutating in place keeps the cell idempotent.
features = features.replace([np.inf, -np.inf], np.nan).dropna()

In [42]:
# Write the cleaned feature table, together with its row index, to a CSV file.
# NOTE(review): the same export is repeated in the final cell under
# "Export features to .csv file"; one of the two can be removed.
features.to_csv("./features.csv")

## Export features to .csv file

In [98]:
# Write the feature table, together with its row index, to ./features.csv.
# The row index is written as the first column; the user `_id` is not part of
# `features`, so it does not appear in the file.
features.to_csv("./features.csv")