In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
import re

In [2]:
# Name of the run to process. It is the file stem of the input that is read
# from '../../<run>.txt' and of the '<run>.csv' file written by export_df.
run = "selected" # presumably the alternative value is "before_selection" — confirm

## Overview

This notebook is the entry point for the Python code. It:

- loads the feature vector
- sets column names
- formats the timestamp into `DateTime` and sets it as the index
- exports to `<run>.csv` (e.g. `selected.csv`)



## Feature Vector in

> Currently importing from a space-separated `.txt` file saved from the Play run. API code for Play is at the bottom but is not implemented yet.

In [3]:
# Column names, in file order, for the headerless feature-vector file.
#
# NOTE(review): "numb_of_hashtagsnumb_of_personal_pronouns" is one name here on
# purpose. The list has always produced this merged label (two adjacent string
# literals with no comma between them), and the column-drop cell refers to it
# by exactly this name. Splitting it changes the number of names assigned to
# the frame, so confirm the real file layout before touching it.
feature_vector_keys = [
    "timestamp",
    "tweet_id",
    "positive_sentiment",
    "negative_sentiment",
    "numb_of_mentions",
    "numb_of_urls",
    "numb_of_hashtagsnumb_of_personal_pronouns",
    "numb_of_present_tenses",
    "numb_of_past_tenses",
    "sent_from_web",
    "numb_of_weird_chars",
    "numb_of_questions",
    "numb_of_emoticons",
    "numb_of_swearing_words",
    "here?",
    "numb_of_slang_words",
    "numb_of_intensifiers",
    "tweet_length",
    "userFollowersCount",
    "userFriendsCount",
    "user_numb_of_tweets",
    "user_list_count",
    "tfidf_fire",
    "dict_precision",
    "dict_recall",
    "dict_f_measure",
]

# The input is space-separated and has no header row, so the names are
# attached after loading. `fV` and `df` refer to the same frame.
fV = pd.read_csv('../../' + run + '.txt', sep=" ", header=None)
fV.columns = feature_vector_keys
df = fV

In [4]:
# Collapse rows that share a tweet_id using the 'first' aggregation, then turn
# the tweet_id group key back into an ordinary column.
first_per_tweet = df.groupby(['tweet_id']).agg('first')
df = first_per_tweet.reset_index(level=0)

In [5]:
# Drop the feature columns that are currently empty.
empty_columns = [
    'numb_of_mentions',
    'numb_of_urls',
    'numb_of_present_tenses',
    'numb_of_past_tenses',
    'numb_of_slang_words',
    'numb_of_intensifiers',
    'numb_of_emoticons',
    'numb_of_weird_chars',
    'user_numb_of_tweets',
    'tfidf_fire',
    'numb_of_hashtagsnumb_of_personal_pronouns',
    'sent_from_web',
    'numb_of_swearing_words',
    'here?',
]
df = df.drop(columns=empty_columns)

# Timestamp

The timestamp comes in as a float (epoch milliseconds). It can be converted to a pandas `Timestamp` and set as the index using the functions below.

In [6]:

def trim(df):
    """Strip the wrapper characters from the ``timestamp`` strings.

    The first and the last character of every ``timestamp`` value are removed.
    NOTE(review): presumably these are a leading "[" and a trailing ","
    left over from the raw export — confirm against the input file.

    The ``timestamp`` column is updated in place on ``df``, so existing
    ``trim(df)`` calls that ignore the return value keep working. Calling this
    twice on the same frame strips two characters from each end.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string-valued ``timestamp`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` without any column whose name starts with "Unnamed".
        Previously this filtered frame was bound to a local name and lost,
        and the function returned ``None``.
    """
    # One slice is equivalent to dropping the first and then the last character.
    df['timestamp'] = df['timestamp'].str[1:-1]
    return df.loc[:, ~df.columns.str.contains('^Unnamed')]
    
def timestamp_format(df):
    """Convert ``timestamp`` to datetimes and return the frame indexed by it.

    The ``timestamp`` column is parsed as a number and interpreted as
    milliseconds (``unit='ms'``). The converted column is written back to
    ``df`` in place, as before.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``timestamp`` column holding numbers or numeric strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``timestamp`` as its index. Previously the indexed
        frame was bound to a local name and lost, and the function returned
        ``None``.

    Raises
    ------
    ValueError
        If a ``timestamp`` value cannot be parsed as a number.
    """
    epoch_ms = pd.to_numeric(df["timestamp"])
    df['timestamp'] = pd.to_datetime(epoch_ms, unit='ms')

    # set_index returns a new frame; hand it back so the caller can use it.
    return df.set_index('timestamp')


# Remove the first and last character of every timestamp string; trim updates
# the timestamp column of df in place.
trim(df)

# The DateTime conversion is left disabled: per the original note, DateTime will break ML.
#timestamp_format(df)

df

Unnamed: 0,tweet_id,timestamp,positive_sentiment,negative_sentiment,numb_of_questions,tweet_length,userFollowersCount,userFriendsCount,user_list_count,dict_precision,dict_recall,dict_f_measure
0,"1.15567913694323098E18,",1.564370352E12,"2.0,","63.0,","19.0,","100.0,","14500.0,","1370.0,","0.0,","0.0,","0.0,",0.0]
1,"1.15571245811337216E18,",1.564378296E12,"4.0,","58.0,","15.0,","74.0,","70.0,","703.0,","1.0,","0.0,","0.0,",0.0]
2,"1.15579045747096781E18,",1.564396893E12,"7.0,","50.0,","43.0,","230.0,","3726.0,","1013.0,","799.0,","0.0,","0.0,",0.0]
3,"1.15581041559670374E18,",1.564401651E12,"2.0,","63.0,","18.0,","99.0,","67928.0,","39311.0,","1330.0,","0.0,","0.0,",0.0]
4,"1.15584734602825728E18,",1.564410456E12,"6.0,","53.0,","46.0,","237.0,","1832.0,","3371.0,","19.0,","0.0,","0.0,",0.0]
...,...,...,...,...,...,...,...,...,...,...,...,...
276,"1.15874599126635315E18,",1.565101547E12,"6.0,","49.0,","10.0,","58.0,","2.0,","16.0,","0.0,","14.289999961853027,","11.109999656677246,",12.5]
277,"1.15874705264738714E18,",1.5651018E12,"1.0,","60.0,","33.0,","140.0,","788559.0,","40.0,","11001.0,","10.0,","11.109999656677246,",10.529999732971191]
278,"1.15875348814021427E18,",1.565103334E12,"7.0,","47.0,","15.0,","80.0,","46.0,","5.0,","1.0,","22.219999313354492,","22.219999313354492,",22.219999313354492]
279,"1.1587591256033239E18,",1.565104678E12,"3.0,","65.0,","8.0,","57.0,","550.0,","720.0,","7.0,","14.289999961853027,","11.109999656677246,",12.5]


In [7]:
# Remove the timestamp column from the frame.
df = df.drop(columns=['timestamp'])

In [8]:
# Cut the last character off every tweet_id string (the displayed values end
# in a comma).
df['tweet_id'] = df['tweet_id'].str.slice(stop=-1)

In [9]:
# Turn any remaining comma in a tweet_id into a dot, then parse it as a number.
# NOTE(review): the parsed ids display as floats (e.g. 1.155679e+18); a float64
# cannot hold every digit of an id this long — confirm that is acceptable.
tweet_id_text = df['tweet_id'].map(lambda raw: str(raw).replace(',', '.'))
df['tweet_id'] = pd.to_numeric(tweet_id_text)
df

Unnamed: 0,tweet_id,positive_sentiment,negative_sentiment,numb_of_questions,tweet_length,userFollowersCount,userFriendsCount,user_list_count,dict_precision,dict_recall,dict_f_measure
0,1.155679e+18,"2.0,","63.0,","19.0,","100.0,","14500.0,","1370.0,","0.0,","0.0,","0.0,",0.0]
1,1.155712e+18,"4.0,","58.0,","15.0,","74.0,","70.0,","703.0,","1.0,","0.0,","0.0,",0.0]
2,1.155790e+18,"7.0,","50.0,","43.0,","230.0,","3726.0,","1013.0,","799.0,","0.0,","0.0,",0.0]
3,1.155810e+18,"2.0,","63.0,","18.0,","99.0,","67928.0,","39311.0,","1330.0,","0.0,","0.0,",0.0]
4,1.155847e+18,"6.0,","53.0,","46.0,","237.0,","1832.0,","3371.0,","19.0,","0.0,","0.0,",0.0]
...,...,...,...,...,...,...,...,...,...,...,...
276,1.158746e+18,"6.0,","49.0,","10.0,","58.0,","2.0,","16.0,","0.0,","14.289999961853027,","11.109999656677246,",12.5]
277,1.158747e+18,"1.0,","60.0,","33.0,","140.0,","788559.0,","40.0,","11001.0,","10.0,","11.109999656677246,",10.529999732971191]
278,1.158753e+18,"7.0,","47.0,","15.0,","80.0,","46.0,","5.0,","1.0,","22.219999313354492,","22.219999313354492,",22.219999313354492]
279,1.158759e+18,"3.0,","65.0,","8.0,","57.0,","550.0,","720.0,","7.0,","14.289999961853027,","11.109999656677246,",12.5]


In [10]:
# Show the frame again; no code has modified it since the previous cell.
df

Unnamed: 0,tweet_id,positive_sentiment,negative_sentiment,numb_of_questions,tweet_length,userFollowersCount,userFriendsCount,user_list_count,dict_precision,dict_recall,dict_f_measure
0,1.155679e+18,"2.0,","63.0,","19.0,","100.0,","14500.0,","1370.0,","0.0,","0.0,","0.0,",0.0]
1,1.155712e+18,"4.0,","58.0,","15.0,","74.0,","70.0,","703.0,","1.0,","0.0,","0.0,",0.0]
2,1.155790e+18,"7.0,","50.0,","43.0,","230.0,","3726.0,","1013.0,","799.0,","0.0,","0.0,",0.0]
3,1.155810e+18,"2.0,","63.0,","18.0,","99.0,","67928.0,","39311.0,","1330.0,","0.0,","0.0,",0.0]
4,1.155847e+18,"6.0,","53.0,","46.0,","237.0,","1832.0,","3371.0,","19.0,","0.0,","0.0,",0.0]
...,...,...,...,...,...,...,...,...,...,...,...
276,1.158746e+18,"6.0,","49.0,","10.0,","58.0,","2.0,","16.0,","0.0,","14.289999961853027,","11.109999656677246,",12.5]
277,1.158747e+18,"1.0,","60.0,","33.0,","140.0,","788559.0,","40.0,","11001.0,","10.0,","11.109999656677246,",10.529999732971191]
278,1.158753e+18,"7.0,","47.0,","15.0,","80.0,","46.0,","5.0,","1.0,","22.219999313354492,","22.219999313354492,",22.219999313354492]
279,1.158759e+18,"3.0,","65.0,","8.0,","57.0,","550.0,","720.0,","7.0,","14.289999961853027,","11.109999656677246,",12.5]


# I/O

In [11]:
def export_df(df, path=None):
    """Clean the feature frame and write it to a CSV file.

    Columns whose name starts with "Unnamed" are left out. In string cells the
    delimiter characters ``[``, ``]`` and ``,`` are removed, and every column
    that then parses as a number is converted to a numeric dtype.

    The earlier version removed *every* non-digit character, which also
    deleted decimal points, so "2.0," was exported as 20 and "100.0," as 1000.
    It also called ``df.round()`` without using the result, so no rounding was
    ever applied; none is applied here either.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to export. It is not modified.
    path : str or path-like, optional
        Destination file. Defaults to ``run + ".csv"``, where ``run`` is the
        notebook-level run name.

    Returns
    -------
    None
    """
    def _numeric_if_possible(column):
        # Leave a column untouched when it holds something that is not a number.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    # Remove Unnamed columns
    kept = df.loc[:, ~df.columns.str.contains('^Unnamed')]

    # Strip only the list delimiters; digits, '.', '-' and exponents survive.
    cleaned = kept.replace(r'[\[\],]', '', regex=True).apply(_numeric_if_possible)

    # Export (the index is written as well, as before)
    target = run + ".csv" if path is None else path
    cleaned.to_csv(target, index=True)

#pd.set_option('precision', 100)
# Write the current frame to "<run>.csv" (the file name is built inside export_df).
export_df(df)

In [12]:
def import_df(df=None, path=None):
    """Load a previously exported feature frame from CSV.

    Columns whose name starts with "Unnamed" (such as the index column written
    on export) are removed. If the file has a ``timestamp`` column it becomes
    the index; otherwise the default index is kept. The earlier version always
    called ``set_index('timestamp')``, which raises ``KeyError`` for a file
    without that column, and it never returned the loaded frame.

    Parameters
    ----------
    df : object, optional
        Ignored. Kept only so that existing ``import_df(df)`` calls still work.
    path : str or path-like, optional
        File to read. Defaults to ``run + ".csv"``, where ``run`` is the
        notebook-level run name.

    Returns
    -------
    pandas.DataFrame
        The loaded frame.
    """
    # Import
    source = run + ".csv" if path is None else path
    loaded = pd.read_csv(source)
    loaded = loaded.loc[:, ~loaded.columns.str.contains('^Unnamed')]

    # Timestamp will need to be reconfigured as index on each load
    if 'timestamp' in loaded.columns:
        loaded = loaded.set_index('timestamp')
    return loaded
#import_df()

# Displays the in-memory frame. export_df cleans only its own local copy, so
# the string cells shown here are unchanged.
df

Unnamed: 0,tweet_id,positive_sentiment,negative_sentiment,numb_of_questions,tweet_length,userFollowersCount,userFriendsCount,user_list_count,dict_precision,dict_recall,dict_f_measure
0,1.155679e+18,"2.0,","63.0,","19.0,","100.0,","14500.0,","1370.0,","0.0,","0.0,","0.0,",0.0]
1,1.155712e+18,"4.0,","58.0,","15.0,","74.0,","70.0,","703.0,","1.0,","0.0,","0.0,",0.0]
2,1.155790e+18,"7.0,","50.0,","43.0,","230.0,","3726.0,","1013.0,","799.0,","0.0,","0.0,",0.0]
3,1.155810e+18,"2.0,","63.0,","18.0,","99.0,","67928.0,","39311.0,","1330.0,","0.0,","0.0,",0.0]
4,1.155847e+18,"6.0,","53.0,","46.0,","237.0,","1832.0,","3371.0,","19.0,","0.0,","0.0,",0.0]
...,...,...,...,...,...,...,...,...,...,...,...
276,1.158746e+18,"6.0,","49.0,","10.0,","58.0,","2.0,","16.0,","0.0,","14.289999961853027,","11.109999656677246,",12.5]
277,1.158747e+18,"1.0,","60.0,","33.0,","140.0,","788559.0,","40.0,","11001.0,","10.0,","11.109999656677246,",10.529999732971191]
278,1.158753e+18,"7.0,","47.0,","15.0,","80.0,","46.0,","5.0,","1.0,","22.219999313354492,","22.219999313354492,",22.219999313354492]
279,1.158759e+18,"3.0,","65.0,","8.0,","57.0,","550.0,","720.0,","7.0,","14.289999961853027,","11.109999656677246,",12.5]


## Play API

In [13]:
## API (not implemented yet)
# The snippet below is wrapped in a string literal, so none of it runs; the
# cell only evaluates to that string.
'''
import http.client

conn = http.client.HTTPConnection("localhost:9000")
conn.request("GET", "/vec")
res = conn.getresponse()
print(res.status, res.reason)

conn.close()
'''

'\nimport http.client\n\nconn = http.client.HTTPConnection("localhost:9000")\nconn.request("GET", "/vec")\nres = conn.getresponse()\nprint(res.status, res.reason)\n\nconn.close()\n'