In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
import re

In [2]:
run = "before_selection"  # base file name: input is ../1-input-data/<run>.txt, export is ../3-csv/<run>.csv

## Overview

This notebook is the entry point for the Python code. It:

- loads the feature vector
- sets column names
- formats the timestamp into `DateTime` and sets it as the index (currently disabled: the timestamp column is dropped instead)
- exports to `../3-csv/<run>.csv`



## Feature Vector in

> Currently importing from a space-separated `.txt` file saved from the Play run. The API code for Play is at the bottom of the notebook but is not implemented yet.

In [4]:
# Column names for the raw feature vector, in file order.
#
# NOTE(review): the original list had no comma between "numb_of_hashtags" and
# "numb_of_personal_pronouns", so Python concatenated them into one name. The fused
# name is written out explicitly here because (a) a later cell drops the column under
# exactly this name and (b) the list has 26 entries, which must equal the number of
# columns in the file for the assignment below to succeed. Splitting it into two
# names needs a 27-column file. TODO confirm against the Play export, and check
# whether the names after this entry are shifted by one.
# "here?" is a placeholder for a feature whose meaning is still unknown.
feature_vector_keys = [
    "timestamp",
    "tweet_id",
    "positive_sentiment",
    "negative_sentiment",
    "numb_of_mentions",
    "numb_of_urls",
    "numb_of_hashtagsnumb_of_personal_pronouns",
    "numb_of_present_tenses",
    "numb_of_past_tenses",
    "sent_from_web",
    "numb_of_weird_chars",
    "numb_of_questions",
    "numb_of_emoticons",
    "numb_of_swearing_words",
    "here?",
    "numb_of_slang_words",
    "numb_of_intensifiers",
    "tweet_length",
    "userFollowersCount",
    "userFriendsCount",
    "user_numb_of_tweets",
    "user_list_count",
    "tfidf_fire",
    "dict_precision",
    "dict_recall",
    "dict_f_measure",
]

# The export has no header row and separates fields with a single space.
fV = pd.read_csv('../1-input-data/' + run + '.txt', sep=" ", header=None)
fV.columns = feature_vector_keys

# `df` is the working name used by the rest of the notebook (same object as `fV`).
df = fV

In [5]:
# Keep one row per tweet: take the first value of every column within each
# tweet_id group, then turn tweet_id back from the index into a regular column.
df = df.groupby(['tweet_id']).first().reset_index()

In [6]:
# Drop the feature columns that are currently empty.
empty_columns = [
    'numb_of_mentions',
    'numb_of_urls',
    'numb_of_present_tenses',
    'numb_of_past_tenses',
    'numb_of_slang_words',
    'numb_of_intensifiers',
    'numb_of_emoticons',
    'numb_of_weird_chars',
    'user_numb_of_tweets',
    'tfidf_fire',
    'numb_of_hashtagsnumb_of_personal_pronouns',
    'sent_from_web',
    'numb_of_swearing_words',
    'here?',
]
df = df.drop(columns=empty_columns)
df

Unnamed: 0,tweet_id,timestamp,positive_sentiment,negative_sentiment,numb_of_questions,tweet_length,userFollowersCount,userFriendsCount,user_list_count,dict_precision,dict_recall,dict_f_measure
0,"1.15548485241168691E18,","[1.564324031E12,","2.0,","66.0,","15.0,","99.0,","18.0,","42.0,","0.0,","11.109999656677246,","11.109999656677246,",11.109999656677246]
1,"1.15548673102360576E18,","[1.564324479E12,","1.0,","39.0,","52.0,","257.0,","7709.0,","2814.0,","392.0,","0.0,","0.0,",0.0]
2,"1.15548843001874432E18,","[1.564324884E12,","2.0,","63.0,","17.0,","98.0,","537.0,","1486.0,","20.0,","0.0,","0.0,",0.0]
3,"1.15549353099270554E18,","[1.5643261E12,","4.0,","57.0,","63.0,","257.0,","295.0,","2145.0,","2.0,","10.0,","11.109999656677246,",10.529999732971191]
4,"1.15549548994669773E18,","[1.564326567E12,","15.0,","42.0,","40.0,","217.0,","1682.0,","104.0,","51.0,","0.0,","0.0,",0.0]
...,...,...,...,...,...,...,...,...,...,...,...,...
2944,"1.1587591256033239E18,","[1.565104678E12,","3.0,","65.0,","8.0,","57.0,","550.0,","720.0,","7.0,","14.289999961853027,","11.109999656677246,",12.5]
2945,"1.15875932609122714E18,","[1.565104726E12,","7.0,","47.0,","15.0,","80.0,","106.0,","124.0,","0.0,","22.219999313354492,","22.219999313354492,",22.219999313354492]
2946,"1.15875993376617677E18,","[1.565104871E12,","5.0,","37.0,","12.0,","57.0,","220.0,","61.0,","3.0,","33.33000183105469,","22.219999313354492,",26.670000076293945]
2947,"1.15876167494759629E18,","[1.565105286E12,","9.0,","28.0,","8.0,","70.0,","33.0,","0.0,","4.0,","16.670000076293945,","11.109999656677246,",13.329999923706055]


# Timestamp

The timestamp comes in as a float; it can be converted to a `Timestamp` and set as the index using the functions below.

In [7]:
def trim(df):
    """Strip the row delimiters from ``timestamp`` and drop ``Unnamed`` columns, in place.

    The raw export wraps each row in ``[...]`` with comma-terminated fields, so the
    first column arrives as text such as ``"[1.564324031E12,"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``timestamp`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. Callers that ignore the return value (the
        original contract returned ``None``) keep working unchanged.

    Raises
    ------
    AttributeError
        If ``timestamp`` is not a string column (the ``.str`` accessor is unavailable).
    """
    # Remove only the known delimiters instead of slicing by position, so running
    # the cell a second time cannot eat into the number itself.
    df['timestamp'] = df['timestamp'].str.lstrip('[').str.rstrip(',')

    # The previous version rebound a local name here, which never reached the
    # caller's frame. Drop the columns on the passed-in object instead.
    unnamed_columns = df.columns[df.columns.str.contains('^Unnamed')]
    df.drop(columns=unnamed_columns, inplace=True)
    return df
    
def timestamp_format(df):
    """Convert the ``timestamp`` column of ``df`` to pandas datetimes, in place.

    The column is first parsed as a number and then interpreted as a count of
    milliseconds (``unit='ms'``). Nothing is returned and the index is left as is.
    """
    column = "timestamp"
    df[column] = pd.to_numeric(df[column])
    df[column] = pd.to_datetime(df[column], unit="ms")


# Remove the leading "[" and trailing "," around each timestamp string.
# trim() works on `df` in place, so its return value is not needed here.
trim(df)

# The datetime conversion is deliberately left disabled: per the original note,
# a DateTime column breaks the downstream ML step.
#timestamp_format(df)

df

Unnamed: 0,tweet_id,timestamp,positive_sentiment,negative_sentiment,numb_of_questions,tweet_length,userFollowersCount,userFriendsCount,user_list_count,dict_precision,dict_recall,dict_f_measure
0,"1.15548485241168691E18,",1.564324031E12,"2.0,","66.0,","15.0,","99.0,","18.0,","42.0,","0.0,","11.109999656677246,","11.109999656677246,",11.109999656677246]
1,"1.15548673102360576E18,",1.564324479E12,"1.0,","39.0,","52.0,","257.0,","7709.0,","2814.0,","392.0,","0.0,","0.0,",0.0]
2,"1.15548843001874432E18,",1.564324884E12,"2.0,","63.0,","17.0,","98.0,","537.0,","1486.0,","20.0,","0.0,","0.0,",0.0]
3,"1.15549353099270554E18,",1.5643261E12,"4.0,","57.0,","63.0,","257.0,","295.0,","2145.0,","2.0,","10.0,","11.109999656677246,",10.529999732971191]
4,"1.15549548994669773E18,",1.564326567E12,"15.0,","42.0,","40.0,","217.0,","1682.0,","104.0,","51.0,","0.0,","0.0,",0.0]
...,...,...,...,...,...,...,...,...,...,...,...,...
2944,"1.1587591256033239E18,",1.565104678E12,"3.0,","65.0,","8.0,","57.0,","550.0,","720.0,","7.0,","14.289999961853027,","11.109999656677246,",12.5]
2945,"1.15875932609122714E18,",1.565104726E12,"7.0,","47.0,","15.0,","80.0,","106.0,","124.0,","0.0,","22.219999313354492,","22.219999313354492,",22.219999313354492]
2946,"1.15875993376617677E18,",1.565104871E12,"5.0,","37.0,","12.0,","57.0,","220.0,","61.0,","3.0,","33.33000183105469,","22.219999313354492,",26.670000076293945]
2947,"1.15876167494759629E18,",1.565105286E12,"9.0,","28.0,","8.0,","70.0,","33.0,","0.0,","4.0,","16.670000076293945,","11.109999656677246,",13.329999923706055]


In [8]:
# Remove the timestamp column from the feature frame.
df = df.drop(columns=['timestamp'])

In [9]:
# Every remaining field still carries the separator it had in the raw row:
# "," after each value and "]" after the last one. Strip those characters
# explicitly rather than chopping the final character, so that running this
# cell twice cannot remove a real digit.
delimited_columns = [
    'tweet_id',
    'positive_sentiment',
    'negative_sentiment',
    'numb_of_questions',
    'tweet_length',
    'userFollowersCount',
    'userFriendsCount',
    'user_list_count',
    'dict_precision',
    'dict_recall',
    'dict_f_measure',
]
for column in delimited_columns:
    df[column] = df[column].str.rstrip(',]')

In [10]:
# Parse tweet_id into a number; any "," left in the text is turned into ".".
# NOTE(review): the scientific-notation strings are parsed as floats (see the
# 1.155485e+18 display below), so the ids are rounded to the nearest representable
# float and no longer match the raw digits exactly. Confirm that is acceptable
# before using tweet_id as a join key.
tweet_id_text = df['tweet_id'].map(str).str.replace(',', '.', regex=False)
df['tweet_id'] = pd.to_numeric(tweet_id_text)
df

Unnamed: 0,tweet_id,positive_sentiment,negative_sentiment,numb_of_questions,tweet_length,userFollowersCount,userFriendsCount,user_list_count,dict_precision,dict_recall,dict_f_measure
0,1.155485e+18,2.0,66.0,15.0,99.0,18.0,42.0,0.0,11.109999656677246,11.109999656677246,11.109999656677246
1,1.155487e+18,1.0,39.0,52.0,257.0,7709.0,2814.0,392.0,0.0,0.0,0.0
2,1.155488e+18,2.0,63.0,17.0,98.0,537.0,1486.0,20.0,0.0,0.0,0.0
3,1.155494e+18,4.0,57.0,63.0,257.0,295.0,2145.0,2.0,10.0,11.109999656677246,10.529999732971191
4,1.155495e+18,15.0,42.0,40.0,217.0,1682.0,104.0,51.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2944,1.158759e+18,3.0,65.0,8.0,57.0,550.0,720.0,7.0,14.289999961853027,11.109999656677246,12.5
2945,1.158759e+18,7.0,47.0,15.0,80.0,106.0,124.0,0.0,22.219999313354492,22.219999313354492,22.219999313354492
2946,1.158760e+18,5.0,37.0,12.0,57.0,220.0,61.0,3.0,33.33000183105469,22.219999313354492,26.670000076293945
2947,1.158762e+18,9.0,28.0,8.0,70.0,33.0,0.0,4.0,16.670000076293945,11.109999656677246,13.329999923706055


In [11]:
# Render floats with two decimals in displayed frames.
pd.set_option('display.float_format', '{:.2f}'.format)

# object -> str -> float, one column at a time; each column name is printed as it
# is converted.
for column_name in df.columns:
    print(column_name)
    df[column_name] = df[column_name].astype(str).astype(float)

# Every float64 column is then cast to int64, which discards the fractional part
# (e.g. the dict_* scores become whole numbers).
float_column_names = df.select_dtypes(include=['float64']).columns
df = df.astype({name: 'int64' for name in float_column_names})

display(df.dtypes)

tweet_id
positive_sentiment
negative_sentiment
numb_of_questions
tweet_length
userFollowersCount
userFriendsCount
user_list_count
dict_precision
dict_recall
dict_f_measure


tweet_id              int64
positive_sentiment    int64
negative_sentiment    int64
numb_of_questions     int64
tweet_length          int64
userFollowersCount    int64
userFriendsCount      int64
user_list_count       int64
dict_precision        int64
dict_recall           int64
dict_f_measure        int64
dtype: object

In [12]:
# Inspect the frame after the int64 cast.
df

Unnamed: 0,tweet_id,positive_sentiment,negative_sentiment,numb_of_questions,tweet_length,userFollowersCount,userFriendsCount,user_list_count,dict_precision,dict_recall,dict_f_measure
0,1155484852411686656,2,66,15,99,18,42,0,11,11,11
1,1155486731023605504,1,39,52,257,7709,2814,392,0,0,0
2,1155488430018744320,2,63,17,98,537,1486,20,0,0,0
3,1155493530992705536,4,57,63,257,295,2145,2,10,11,10
4,1155495489946697728,15,42,40,217,1682,104,51,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
2944,1158759125603323904,3,65,8,57,550,720,7,14,11,12
2945,1158759326091227136,7,47,15,80,106,124,0,22,22,22
2946,1158759933766176768,5,37,12,57,220,61,3,33,22,26
2947,1158761674947596288,9,28,8,70,33,0,4,16,11,13


# I/O

In [20]:
def clean_dataset_int(df):
    """Return a float64 copy of ``df`` without ``Unnamed`` columns or non-finite rows.

    Steps, in order:

    1. drop every column whose name starts with ``Unnamed``;
    2. drop every row that contains a missing value, ``+inf`` or ``-inf``;
    3. remove runs of non-digit characters from string cells;
    4. cast the result to ``float64``.

    Despite the function's name, the returned frame is ``float64``, not integer.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame keeping the original index labels of the surviving rows.

    Raises
    ------
    ValueError
        If a cell still cannot be converted to float after step 3.
    """
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

    # `axis` must be passed by keyword: the positional form `any(1)` is rejected by
    # current pandas releases.
    has_bad_value = (df.isna() | df.isin([np.inf, -np.inf])).any(axis=1)
    df = df.loc[~has_bad_value]

    # The earlier unassigned `df.replace(np.nan, 0)` and `df.astype(int)` calls were
    # removed: their results were discarded, and `astype(int)` raised on exactly the
    # NaN/inf rows this function is meant to filter out.

    # NOTE(review): this strips *every* non-digit from string cells, including "."
    # and "-", so "-1.5" would become "15". Numeric columns are unaffected.
    df = df.replace(r'\D+', '', regex=True)
    return df.astype(np.float64)

def export_df(df, path=None):
    """Write ``df`` to CSV without its ``Unnamed`` columns and without the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to export; it is not modified.
    path : str, optional
        Destination file. Defaults to ``"../3-csv/<run>.csv"``, built from the
        notebook-level ``run`` variable, which is the location used before this
        parameter existed.

    Returns
    -------
    None
    """
    if path is None:
        path = "../3-csv/" + run + ".csv"

    # Remove Unnamed columns
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

    # NOTE(review): this strips *every* non-digit from string cells, including "."
    # and "-". Numeric columns are unaffected.
    df = df.replace(r'\D+', '', regex=True)

    # The previous bare `df.round()` call was removed: its result was discarded, so
    # it never changed what was written.
    df.to_csv(path, index=False)

# Clean the frame with clean_dataset_int() and write the result to ../3-csv/<run>.csv.
csv_path = "../3-csv/" + run + ".csv"
df = clean_dataset_int(df)
df.to_csv(csv_path, index=False)

In [21]:
def import_df(path=None):
    """Load the exported CSV back into a DataFrame.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to ``"../3-csv/<run>.csv"``, the location the
        export step writes to, built from the notebook-level ``run`` variable.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with every ``Unnamed`` column removed. The function
        previously ended in a bare ``df`` expression and so returned ``None``.
    """
    if path is None:
        # Read from where the export writes. The bare `run + ".csv"` used before
        # pointed at the notebook's working directory instead.
        path = "../3-csv/" + run + ".csv"

    # Import
    df = pd.read_csv(path)
    # Timestamp will need to be reconfigured as index on each load
    #df = df.set_index('timestamp')
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    return df

# NOTE: the result of import_df() is not assigned to anything, so the `df` shown
# below is the in-memory frame from the export step, not a freshly loaded one.
import_df()

df

Unnamed: 0,tweet_id,positive_sentiment,negative_sentiment,numb_of_questions,tweet_length,userFollowersCount,userFriendsCount,user_list_count,dict_precision,dict_recall,dict_f_measure
0,1155484852411686656.00,2.00,66.00,15.00,99.00,18.00,42.00,0.00,11.00,11.00,11.00
1,1155486731023605504.00,1.00,39.00,52.00,257.00,7709.00,2814.00,392.00,0.00,0.00,0.00
2,1155488430018744320.00,2.00,63.00,17.00,98.00,537.00,1486.00,20.00,0.00,0.00,0.00
3,1155493530992705536.00,4.00,57.00,63.00,257.00,295.00,2145.00,2.00,10.00,11.00,10.00
4,1155495489946697728.00,15.00,42.00,40.00,217.00,1682.00,104.00,51.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...
2944,1158759125603323904.00,3.00,65.00,8.00,57.00,550.00,720.00,7.00,14.00,11.00,12.00
2945,1158759326091227136.00,7.00,47.00,15.00,80.00,106.00,124.00,0.00,22.00,22.00,22.00
2946,1158759933766176768.00,5.00,37.00,12.00,57.00,220.00,61.00,3.00,33.00,22.00,26.00
2947,1158761674947596288.00,9.00,28.00,8.00,70.00,33.00,0.00,4.00,16.00,11.00,13.00


## Play API

In [22]:
# Sketch of fetching the feature vector from the Play API (GET /vec on localhost:9000).
# Not implemented yet: the code is kept inside a string literal so that it does not run.
'''
import http.client

conn = http.client.HTTPConnection("localhost:9000")
conn.request("GET", "/vec")
res = conn.getresponse()
print(res.status, res.reason)

conn.close()
'''

'\nimport http.client\n\nconn = http.client.HTTPConnection("localhost:9000")\nconn.request("GET", "/vec")\nres = conn.getresponse()\nprint(res.status, res.reason)\n\nconn.close()\n'