# Connect to Twitter

In [21]:
import tweepy
import configparser

# Read the four Twitter credentials from the "Twitter" section of keys.cfg.
config = configparser.ConfigParser()
config.read("keys.cfg")
consumer_secret, consumer_token, token_key, token_secret = (
    config.get("Twitter", option)
    for option in ("consumer_secret", "consumer_token", "token_key", "token_secret")
)

# Build the OAuth handler from the consumer pair, attach the access token,
# and create the API client used by the following cells.
auth = tweepy.OAuthHandler(consumer_token, consumer_secret)
auth.set_access_token(token_key, token_secret)
api = tweepy.API(auth)

# Building a dataset

In [41]:
import pandas as pd
import itertools 
from tweepy import Cursor as C_
# Search terms to query; each kept tweet is tagged with the term that found it.
topics = ["NO_TOPIC"]
cols = ['sentiment','id','date','query_string','user','text']
# Maximum number of tweets requested from the search cursor for each topic.
n_tweets = 100

# Rows are gathered in a plain list and turned into a DataFrame once at the
# end: appending to a DataFrame inside the loop copies the whole frame on
# every row, and DataFrame.append no longer exists in pandas 2.x.
records = []
for topic in topics:
    # A separate cursor per topic. Building the cursor inside the loop also
    # guarantees `topic` is bound before it is used, so the cell runs on a
    # fresh kernel and every tweet is labelled with its own search term.
    for tweet in C_(api.search, topic, lang='en').items(n_tweets):
        # Skip tweets whose text is 3 characters or shorter.
        if len(tweet.text) > 3:
            # -1 is a placeholder sentiment; the merge step later drops rows
            # that still carry it.
            records.append([-1, tweet.id, tweet.created_at, topic, tweet.user.name, tweet.text])

df = pd.DataFrame(records, columns=cols)

In [None]:
# Use ";" as the delimiter: the merge step reads this file back with sep=";",
# so writing the default "," would make that read fail to find the columns.
df.to_csv("data/new_dataset.csv", sep=";", index=False)

# Merge with the existing dataset

In [6]:
import pandas as pd 
import os 

cols = ['sentiment','id','date','query_string','user','text']

# Newly collected tweets (";"-delimited). Rows whose sentiment is still -1
# are left out of the merge.
new_df = pd.read_csv("data/new_dataset.csv", sep=";")
new_df = new_df[new_df['sentiment'] != -1]

# The Sentiment140 files are read as header-less, ISO-8859-1 encoded CSVs;
# the column names are supplied from `cols`.
train_df = pd.read_csv(
    "data/sentiment140/training.1600000.processed.noemoticon.csv",
    header=None,
    names=cols,
    encoding="ISO-8859-1",
)
test_df = pd.read_csv(
    "data/sentiment140/testdata.manual.2009.06.14.csv",
    header=None,
    names=cols,
    encoding="ISO-8859-1",
)

# ignore_index=True gives the merged frame a fresh 0..n-1 index. Without it
# each part keeps its own row labels, so the same label would appear in
# several parts and label-based lookups would return multiple rows.
df = pd.concat([train_df, test_df, new_df], ignore_index=True)


In [19]:
# Save the merged frame as CSV, leaving out the index column.
merged_path = "data/merge_dataset.csv"
df.to_csv(merged_path, index=False)