### Import Modules

In [219]:
import pandas as pd
import numpy as np
import datetime
import pickle as pkl

### Reduce Memory Usage

In [197]:
def reduce_mem_usage(props):
    """
    Downcast the numeric columns of ``props`` to smaller dtypes.

    The frame is modified in place and also returned. For every numeric
    column:

    * non-numeric columns (strings, datetimes, ...) are skipped;
    * missing values are replaced by the sentinel ``column minimum - 1``;
    * if every value is a whole number the column is cast to the smallest
      unsigned/signed integer dtype that can hold its range;
    * otherwise (fractional or infinite values) the column is cast to
      float32, which may lose precision compared to float64.

    Progress and the memory footprint before/after are printed.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame whose numeric columns should be downcast.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with downcast columns.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = []  # Keeps track of columns that have missing values filled in.
    for col in props.columns:
        # Only numeric columns can be downcast; this skips strings and
        # datetimes of any resolution instead of failing on them.
        if not pd.api.types.is_numeric_dtype(props[col]):
            continue

        # Print current column type
        print("******************************")
        print("Column: ",col)
        print("dtype before: ",props[col].dtype)

        mx = props[col].max()
        mn = props[col].min()

        # Integer does not support NA, therefore, NA needs to be filled
        if not np.isfinite(props[col]).all():
            NAlist.append(col)
            # Assign back instead of a chained in-place fillna, which does
            # not reliably write through to the frame.
            props[col] = props[col].fillna(mn - 1)
            # The sentinel is now the smallest value; refresh the minimum so
            # a column like [0, 1, NaN] is not cast to an unsigned dtype.
            mn = props[col].min()

        # A column is integral when no value has a fractional part. Absolute
        # differences are summed so that e.g. 0.5 and -0.5 cannot cancel out.
        # Columns that still hold inf/NaN cannot be integers at all.
        IsInt = False
        if np.isfinite(props[col]).all():
            asint = props[col].astype(np.int64)
            IsInt = (props[col] - asint).abs().sum() < 0.01

        if IsInt:
            # Pick the narrowest integer dtype whose range covers [mn, mx].
            if mn >= 0:
                candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
            else:
                candidates = (np.int8, np.int16, np.int32, np.int64)
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= mn and mx <= limits.max:
                    props[col] = props[col].astype(candidate)
                    break
        else:
            # Make float datatypes 32 bit
            props[col] = props[col].astype(np.float32)

        # Print new column type
        print("dtype after: ",props[col].dtype)
        print("******************************")

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props

### Load Data

In [225]:
# Load the tweets from a CSV file; the path is relative to the working directory.
tweet = pd.read_csv('Data/tweets.csv')

In [193]:
def createtimestamp(df):
    """
    Add an ``hours`` column holding each tweet's timestamp truncated to the hour.

    The ``timestamp`` column is parsed with the format ``%Y-%m-%d %H:%M:%S``
    and rounded down to the full hour, so that tweets can later be counted
    per hour. The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``timestamp`` column of strings in the format above.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the ``hours`` column added.

    Raises
    ------
    ValueError
        If a timestamp string does not match the expected format.
    """
    # Parse the whole column at once instead of looping over rows and writing
    # through chained indexing, which is slow and does not reliably update df.
    parsed = pd.to_datetime(df['timestamp'], format='%Y-%m-%d %H:%M:%S')
    df['hours'] = parsed.dt.floor('h')
    return df

In [176]:
# NOTE(review): `df` is not assigned in any cell visible above this one (only
# `tweet` is loaded) — confirm where it comes from so Restart & Run All works.
df = createtimestamp(df)
df = reduce_mem_usage(df)
# Persist the processed frame; the context manager closes the file handle
# even if pickling fails.
with open('timestamped', 'wb') as fh:
    pkl.dump(df, fh)

In [194]:
def hourlytweets(df):
    """
    Attach to every row the number of tweets that share its ``hours`` value.

    The counts are taken from the non-null ``tweet-id`` entries of each
    ``hours`` group and written to a new ``activity`` column. The frame is
    modified in place and also returned.
    """
    # hour -> number of tweet ids seen in that hour
    tweets_per_hour = df.groupby(['hours'])['tweet-id'].count().to_dict()
    df['activity'] = df['hours'].map(tweets_per_hour)
    return df

In [None]:
# Add the per-hour tweet count as the 'activity' column.
df = hourlytweets(df)

In [None]:
# Summarise the columns of df: dtypes, non-null counts and memory usage.
df.info()

In [256]:
def deleteretweets(df):
    """
    Return a copy of ``df`` without duplicated texts and without retweets.

    Rows are first de-duplicated on the ``text`` column (the first occurrence
    is kept). Rows whose text contains the standalone, upper-case token
    ``RT`` are then dropped. Rows with a missing text are not treated as
    retweets and are kept. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column.

    Returns
    -------
    pandas.DataFrame
        The filtered rows of ``df``.
    """
    # Work on the frame that was passed in, not on a notebook-level global.
    unique_tweets = df.drop_duplicates(subset='text')
    # Word boundaries keep words that merely contain "RT" (e.g. "SMART");
    # na=False keeps the mask boolean when a text is missing.
    is_retweet = unique_tweets['text'].str.contains(r'\bRT\b', na=False)
    return unique_tweets[~is_retweet]

In [218]:
# Keep the filtered result (no duplicate texts, no retweets) under its own name.
df_sin = deleteretweets(df)

Unnamed: 0,user,fullname,tweet-id,timestamp,url,likes,replies,retweets,text,html,hours,activity
0,julia_vaingurt,Julia Vaingurt,746855460402302976,2016-06-25 23:59:52,/julia_vaingurt/status/746855460402302976,0,0,0,Bitcoin: Pissed-off customers sue GAW Miners i...,"<p class=""TweetTextSize js-tweet-text tweet-te...",2016-06-25 23:00:00,554
1,haj1106,hunter,746855393695924224,2016-06-25 23:59:36,/haj1106/status/746855393695924224,0,0,0,I'm not gone lie I be staring at pplwhat btc's...,"<p class=""TweetTextSize js-tweet-text tweet-te...",2016-06-25 23:00:00,554
2,rimbit,Rimbit ➡,746855315589652480,2016-06-25 23:59:17,/rimbit/status/746855315589652480,0,0,0,Did you miss Beyond Bitcoin Hangout? I CANNOT ...,"<p class=""TweetTextSize js-tweet-text tweet-te...",2016-06-25 23:00:00,554
3,coinstories,Coin Stories,746855306026680321,2016-06-25 23:59:15,/coinstories/status/746855306026680321,0,0,0,What are the merits and risks of investing in ...,"<p class=""TweetTextSize js-tweet-text tweet-te...",2016-06-25 23:00:00,554
4,bitcointonic,Julia Edling,746855274816798720,2016-06-25 23:59:08,/bitcointonic/status/746855274816798720,1,0,1,Hash Ocean #Bitcoin Cloud Mining №1 in the wor...,"<p class=""TweetTextSize js-tweet-text tweet-te...",2016-06-25 23:00:00,554
5,onlineplatforms,Investors Europe,746855266210156545,2016-06-25 23:59:06,/onlineplatforms/status/746855266210156545,0,0,0,Tumultuous Week Fuels 30% Gain in Bitcoin and ...,"<p class=""TweetTextSize js-tweet-text tweet-te...",2016-06-25 23:00:00,554
6,coindesk,CoinDesk,746855253518188545,2016-06-25 23:59:02,/coindesk/status/746855253518188545,12,0,41,The latest Bitcoin Price Index is 665.60 USD h...,"<p class=""TweetTextSize js-tweet-text tweet-te...",2016-06-25 23:00:00,554
7,Henotsuke,Ivanov Vera,746855196186251264,2016-06-25 23:58:49,/Henotsuke/status/746855196186251264,0,0,0,http://gameofcoins.org/register/ncLLpdPQ0Ojj …...,"<p class=""TweetTextSize js-tweet-text tweet-te...",2016-06-25 23:00:00,554
8,BitcoinCL,CL Bitcoin Sales,746855008163889152,2016-06-25 23:58:04,/BitcoinCL/status/746855008163889152,0,0,0,http://club.nocroom.com < #bitcoinclub #washi...,"<p class=""TweetTextSize js-tweet-text tweet-te...",2016-06-25 23:00:00,554
9,Gamerholic,Gamerholic,746854937338871808,2016-06-25 23:57:47,/Gamerholic/status/746854937338871808,0,0,0,"Gamerholic, WIN FOR A LIVING.. win #bitcoin or...","<p class=""TweetTextSize js-tweet-text tweet-te...",2016-06-25 23:00:00,554


In [253]:
# Pull the tweet texts next to their original row index and discard rows with
# missing values.
# NOTE(review): `tweet1` is not defined in the cells shown here — confirm its origin.
text = tweet1['text'].reset_index().dropna()
# NOTE(review): duplicates are detected on the (index, text) pair, so rows that
# differ only by index are kept — confirm this is intended.
text1 = text.drop_duplicates()
# Case-sensitive substring search: lower-case 'rt' also matches inside ordinary
# words such as "Support" or "Part".
text1[text1['text'].str.contains('rt')]

Unnamed: 0,index,text
10,10,Bitcoin Goes Official On New York Stock Exchan...
41,41,Bitcoin Price Watch; Finally Some Support? htt...
52,52,#Bitcoin Exchange #Coinbase Adds #Paypal Suppo...
65,65,Why Bitcoin users love Brexit - http://buff.ly...
69,69,Bitcoin Price Watch; Finally Some Support? htt...
84,84,"Understanding the Lightning Network, Part 3: C..."
87,87,#Start using #cryptocurrency with us right now...
88,88,"Ned Scott @CertainAssets, co-founder of @Steem..."
93,93,BTCTurk 2072.2 TL BTCe 636.602 $ CampBx $ Bit...
99,99,New Economy Movement Market Report: XEM/BTC Up...
