In [89]:
import pandas as pd
from collections import OrderedDict

In [59]:
# Notebook configuration: every file read or written lives under DATA_DIR.
DATA_DIR = 'data/'
# Raw spreadsheet that the load step reads.
INPUT_FILE = ''.join([DATA_DIR, 'all_original.xlsx'])
# NOTE(review): the "Export All" cell writes to 'data/all_cleaned.csv' and does
# not reference OUTPUT_FILE -- confirm which of the two names is intended.
OUTPUT_FILE = ''.join([DATA_DIR, 'all_clean.csv'])

# Load

In [107]:
# Read the raw spreadsheet into a DataFrame, then preview its first rows.
original = pd.read_excel(io=INPUT_FILE)
original.head(n=5)

Unnamed: 0,Date,Time,Tweet,Client,Client Simplified
0,12/05/2016,12:00:27 PM,If the press would cover me accurately & honor...,Twitter Web Client,Twitter Web Client
1,12/05/2016,9:53:11 AM,I am thrilled to nominate Dr. @RealBenCarson a...,Twitter for iPhone,Twitter for iPhone
2,12/04/2016,6:30:22 PM,their country (the U.S. doesn't tax them) or t...,Twitter for Android,Twitter for Android
3,12/04/2016,6:23:55 PM,Did China ask us if it was OK to devalue their...,Twitter for Android,Twitter for Android
4,12/04/2016,5:47:21 PM,".@FoxNews will be re-running ""Objectified: Don...",Twitter for Android,Twitter for Android


# Clean

In [117]:
# Build the cleaned frame in one non-mutating chain.  The previous version
# assigned into a column-subset of `original` via .loc, which can trigger
# SettingWithCopyWarning and, depending on the pandas version, leave the
# combined column as object dtype instead of datetime64.
intermediate = (
    original[['Date', 'Time', 'Client', 'Tweet']]
    # Lower-case the column names used from here on.
    .rename(columns={'Date': 'date', 'Time': 'time',
                     'Client': 'client', 'Tweet': 'tweet'})
    # Merge the separate date and time strings into a single timestamp.
    .assign(date=lambda frame: pd.to_datetime(frame['date'] + ' ' + frame['time']))
    # The time of day now lives inside `date`, so the raw column is redundant.
    .drop('time', axis=1)
    # Index by timestamp so later steps can group the tweets by calendar period.
    .set_index('date')
)
intermediate.head()

Unnamed: 0_level_0,client,tweet
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-12-05 12:00:27,Twitter Web Client,If the press would cover me accurately & honor...
2016-12-05 09:53:11,Twitter for iPhone,I am thrilled to nominate Dr. @RealBenCarson a...
2016-12-04 18:30:22,Twitter for Android,their country (the U.S. doesn't tax them) or t...
2016-12-04 18:23:55,Twitter for Android,Did China ask us if it was OK to devalue their...
2016-12-04 17:47:21,Twitter for Android,".@FoxNews will be re-running ""Objectified: Don..."


# Prep for Categorization

## Lakoff's Categories

1. __Preemptive Framing__: frame an issue before other people get a chance to (e.g. calling dossier fake news)
2. __Diversion__: distract from a major issue by saying something outrageous (e.g., attacking Hamilton)
3. __Trial Balloon__: test public reaction to a proposal (e.g., expanding nuclear capabilities)
4. __Deflection__: attack the messenger (e.g., BuzzFeed and CNN)
5. __Salient Exemplar__: generalize from a specific case

In [118]:
# Category key -> human-readable description, in presentation order.
# The keys are used verbatim as column names when the category columns are
# added, so their spelling (including 'trial_ballooon') is kept unchanged.
categories = OrderedDict([
    ('preemptive_framing',
     'frame an issue before other people get a chance to (e.g. calling dossier fake news)'),
    ('diversion',
     'distract from a major issue by saying something outrageous (e.g. attacking hamilton)'),
    ('trial_ballooon',
     'test public reaction to a proposal (e.g., expanding nuclear capabilities)'),
    ('deflection',
     'attack the messenger (e.g., BuzzFeed and CNN)'),
    ('salient_exemplar',
     'generalize from a specific case'),
])
categories

OrderedDict([('preemptive_framing',
              'frame an issue before other people get a chance to (e.g. calling dossier fake news)'),
             ('diversion',
              'distract from a major issue by saying something outrageous (e.g. attacking hamilton)'),
             ('trial_ballooon',
              'test public reaction to a proposal (e.g., expanding nuclear capabilities)'),
             ('deflection', 'attack the messenger (e.g., BuzzFeed and CNN)'),
             ('salient_exemplar', 'generalize from a specific case')])

## Add Integer Boolean Columns for Categories

In [119]:
# Work on a copy: a plain `final = intermediate` only aliases the frame, so
# adding the category columns would silently modify `intermediate` as well and
# make this cell's effect depend on how often it has been run.
final = intermediate.copy()
# One integer flag column per category, initialised to 0 (not yet categorized).
for category in categories:
    final[category] = 0
final.head()

Unnamed: 0_level_0,client,tweet,preemptive_framing,diversion,trial_ballooon,deflection,salient_exemplar
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-12-05 12:00:27,Twitter Web Client,If the press would cover me accurately & honor...,0,0,0,0,0
2016-12-05 09:53:11,Twitter for iPhone,I am thrilled to nominate Dr. @RealBenCarson a...,0,0,0,0,0
2016-12-04 18:30:22,Twitter for Android,their country (the U.S. doesn't tax them) or t...,0,0,0,0,0
2016-12-04 18:23:55,Twitter for Android,Did China ask us if it was OK to devalue their...,0,0,0,0,0
2016-12-04 17:47:21,Twitter for Android,".@FoxNews will be re-running ""Objectified: Don...",0,0,0,0,0


# Export All

In [120]:
# Write the full cleaned frame (timestamp index included) as UTF-8 CSV.
# NOTE(review): the config cell defines OUTPUT_FILE as 'data/all_clean.csv',
# which differs from the path written here -- confirm which name is intended.
final.to_csv(path_or_buf='data/all_cleaned.csv', encoding='utf-8')

# Export Each Month

Write one CSV per calendar month so the tweets can be hand-categorized in smaller batches.

In [121]:
# Group the tweets by calendar month of the datetime index.
# pd.TimeGrouper was deprecated in pandas 0.21 and removed in later releases;
# pd.Grouper(freq=...) is the supported equivalent.
months = final.groupby(pd.Grouper(freq='M'))

In [122]:
# Write each monthly group to its own UTF-8 CSV under DATA_DIR, named after
# the group's year and month (month-YYYY-MM.csv).
for month_label, month_tweets in months:
    month_path = DATA_DIR + month_label.strftime("month-%Y-%m.csv")
    month_tweets.to_csv(month_path, encoding='utf-8')