In [1]:
from IPython.core.magic import (register_line_magic, register_cell_magic,
                                register_line_cell_magic)
from IPython.display import display
from IPython.display import HTML

import cPickle
from dateutil import parser
from os import listdir
from os.path import isfile, join, split

import numpy as np
import pandas as pd
import pandas.io.sql as psql
import psycopg2
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sqlalchemy import create_engine

import df_to_hawq
import params

In [2]:
# Raw DB-API connection used by the SQL cell magics. Autocommit is switched on
# right away so each statement takes effect as soon as it runs.
conn = psycopg2.connect(
    database=params.database,
    host=params.host,
    port=params.port,
    user=params.username,
    password=params.password,
)
conn.autocommit = True

# SQLAlchemy engine built from the same settings in the local `params` module.
engine = create_engine(
    'postgresql://{}:{}@{}:{}/{}'.format(
        params.username,
        params.password,
        params.host,
        params.port,
        params.database,
    )
)

In [3]:
# Holds whatever the most recent SQL magic produced, so later cells can use it.
_df = None


@register_cell_magic
def showsql(line, cell):
    """
        Cell magic for queries that return rows (e.g. SELECT).

        The cell body is handed unchanged to the database through the
        module-level ``conn`` connection. The result is read into a Pandas
        dataframe, stored in the global ``_df`` and rendered inline below
        the cell using IPython.display. The ``line`` argument is not used.
    """
    global _df
    # ``conn`` is only read here, so it resolves to the module-level connection.
    _df = psql.read_sql(cell, conn)
    conn.commit()
    display(_df)
    
@register_cell_magic
def execsql(line, cell):
    """
        Cell magic for statements that return no rows (CREATE/UPDATE/DELETE).

        The cell body is handed unchanged to the database through the
        module-level ``conn`` connection and then committed. Whatever
        ``psql.execute`` returns is stored in the global ``_df``; nothing is
        displayed. The ``line`` argument is not used.
    """
    global _df
    # ``conn`` is only read here, so it resolves to the module-level connection.
    _df = psql.execute(cell, conn)
    conn.commit()

# The decorators have already registered both magics with IPython. Removing
# the plain function names avoids name conflicts so that automagic works.
del showsql, execsql

In [None]:
# Both files are read with header=None, so the column names are set by hand.
columns = ['polarity', 'tweetid', 'date', 'query_name', 'user', 'text']

dftrain = pd.read_csv(
    'stanford-sentiment-twitter-data/training.1600000.processed.noemoticon.csv',
    header=None,
    encoding='ISO-8859-1',
)
dftrain.columns = columns

dftest = pd.read_csv(
    'stanford-sentiment-twitter-data/testdata.manual.2009.06.14.csv',
    header=None,
    encoding='ISO-8859-1',
)
dftest.columns = columns

## Load data to GPDB

In [6]:
# First ten rows (all columns) of the training frame.
dfsubset = dftrain.iloc[:10, :]

In [7]:
# Hand the ten-row sample to the project-local df_to_hawq helper, targeting
# mdl.tweets_train. NOTE(review): presumably a small trial load - confirm.
df_to_hawq.df_to_hawq(dfsubset, 'mdl.tweets_train', engine)

In [17]:
# New frame holding only the label, a constant train_set marker and the text.
dftrain_export = dftrain.assign(train_set=1)[['polarity', 'train_set', 'text']]

def df_add_id_train(df, is_train):
    """Prepend ``id`` and ``is_train`` columns to ``df`` in place.

    ``id`` goes in position 0 and is filled with the frame's current index
    values; ``is_train`` goes in position 1 and holds the given ``is_train``
    value on every row. The same (now modified) frame object is returned.
    """
    n_rows = len(df)
    leading_columns = (
        ('id', df.index.tolist()),
        ('is_train', [is_train] * n_rows),
    )
    for position, (name, values) in enumerate(leading_columns):
        df.insert(position, name, values)
    return df
dftrain_export = df_add_id_train(dftrain_export, 1)

# Write the whole frame to mdl.tweets_train, replacing any existing table,
# in chunks of 10000 rows.
dftrain_export.to_sql(
    'tweets_train',
    engine,
    schema='mdl',
    index=False,
    if_exists='replace',
    chunksize=10000,
)

## Build model

In [20]:
%%execsql

-- Fits a CountVectorizer + LogisticRegression pipeline on the given tweets and
-- returns the fitted pipeline pickled as bytea.
--   tweets:     tweet texts
--   polarities: one label per tweet, in the same order as tweets
DROP FUNCTION IF EXISTS mdl.train_sentiment_model1(tweets text[], polarities bigint[]);
CREATE FUNCTION mdl.train_sentiment_model1(tweets text[], polarities bigint[])
RETURNS bytea AS $$
import pickle
import re

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

def regex_preprocess(raw_tweets):
    """Normalise raw tweet texts and return them as a pandas Series.

    Replaces @mentions with USERNAME and web addresses with URL, then
    shortens any character repeated three or more times in a row to two.
    Keep this in step with the copy inside mdl.apply_sentiment_model1, so
    scoring sees the same text normalisation as training.
    """
    pp_text = pd.Series(raw_tweets)
    
    user_pat = '(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9]+)'
    http_pat = '(https?:\/\/(?:www\.|(?!www))[^\s\.]+\.[^\s]{2,}|www\.[^\s]+\.[^\s]{2,})'
    repeat_pat, repeat_repl = "(.)\\1\\1+",'\\1\\1'

    pp_text = pp_text.str.replace(pat = user_pat, repl = 'USERNAME')
    pp_text = pp_text.str.replace(pat = http_pat, repl = 'URL')
    # str.replace returns a new Series, so the result has to be assigned back;
    # without the assignment the repeated-character step had no effect.
    pp_text = pp_text.str.replace(pat = repeat_pat, repl = repeat_repl)
    return pp_text
    
# Unigram counts with English stop words removed; terms appearing in fewer
# than 100 of the supplied tweets are dropped (min_df = 100).
sentiment_lr = Pipeline([('count_vect', CountVectorizer(min_df = 100,
                                                        ngram_range = (1,1),
                                                        stop_words = 'english')), 
                         ('lr', LogisticRegression())])

X = regex_preprocess(tweets)
sentiment_lr.fit(X, polarities)
return pickle.dumps(sentiment_lr)
$$ LANGUAGE plpythonu;

-- Rebuild mdl.sentiment_model: a single row holding the model trained on
-- 1000 randomly chosen rows of mdl.tweets_train.
DROP TABLE IF EXISTS mdl.sentiment_model;
CREATE TABLE mdl.sentiment_model AS
SELECT
    mdl.train_sentiment_model1(array_agg(f.text), array_agg(f.polarity)) AS model
FROM (
    SELECT *
    FROM mdl.tweets_train
    ORDER BY random()
    LIMIT 1000
) AS f;

In [25]:
%%showsql
-- Read back the stored model row.
-- NOTE(review): this cell holds several statements and the output rendered
-- under it only shows the final SELECT - confirm this one is still wanted.
SELECT * FROM mdl.sentiment_model;

-- Scores tweets with a pickled pipeline.
--   model:  pickled pipeline, as produced by pickle.dumps
--   tweets: tweet texts to score
-- Returns one float8 per tweet: column 1 of predict_proba
-- (NOTE(review): presumably the probability of the positive class - confirm).
DROP FUNCTION IF EXISTS mdl.apply_sentiment_model1(model bytea, tweets text[]);
CREATE FUNCTION mdl.apply_sentiment_model1(model bytea, tweets text[])
RETURNS float8[] AS $$
import pickle
import re

# NOTE(review): dill is imported but not referenced below - confirm it is needed.
import dill
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

def regex_preprocess(raw_tweets):
    """Normalise raw tweet texts and return them as a pandas Series.

    Replaces @mentions with USERNAME and web addresses with URL, then
    shortens any character repeated three or more times in a row to two.
    Keep this in step with the copy inside mdl.train_sentiment_model1, so
    scoring sees the same text normalisation as training.
    """
    pp_text = pd.Series(raw_tweets)
    
    user_pat = '(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9]+)'
    http_pat = '(https?:\/\/(?:www\.|(?!www))[^\s\.]+\.[^\s]{2,}|www\.[^\s]+\.[^\s]{2,})'
    repeat_pat, repeat_repl = "(.)\\1\\1+",'\\1\\1'

    pp_text = pp_text.str.replace(pat = user_pat, repl = 'USERNAME')
    pp_text = pp_text.str.replace(pat = http_pat, repl = 'URL')
    # str.replace returns a new Series, so the result has to be assigned back;
    # without the assignment the repeated-character step had no effect.
    pp_text = pp_text.str.replace(pat = repeat_pat, repl = repeat_repl)
    return pp_text

# The pickle comes from this database's own model table; do not point this
# function at bytes from an untrusted source, since unpickling can run code.
cl = pickle.loads(model)
X = regex_preprocess(tweets)
return cl.predict_proba(X)[:,1]
$$ LANGUAGE plpythonu;

-- Score a random sample of training tweets: each row is kept when
-- random() < .0065, capped at 10000 rows, and the texts are passed to the
-- model as one array.
SELECT tweets, mdl.apply_sentiment_model1(model, tweets)
FROM mdl.sentiment_model
CROSS JOIN (
    SELECT array_agg(text) AS tweets
    FROM (
        SELECT *
        FROM mdl.tweets_train
        WHERE random() < .0065
        LIMIT 10000
    ) AS f
) AS f1;

Unnamed: 0,tweets,apply_sentiment_model1
0,[Grrr.. my ipods acting weird too! Jai ho and ...,"[0.470217890525, 0.470217890525, 0.60053273936..."


## Try model in Python 2.7

In [26]:
%%showsql
-- Pull the pickled model into the notebook; showsql leaves the result in _df.
SELECT model FROM mdl.sentiment_model

Unnamed: 0,model
0,"[c, c, o, p, y, _, r, e, g, \n, _, r, e, c, o,..."


In [29]:
import pickle

# _df holds the result of the previous %%showsql cell. The first value of its
# model column is converted with str() and unpickled back into the fitted
# pipeline. Only unpickle data that comes from a trusted database.
cl = pickle.loads(str(_df['model'][0]))

In [41]:
sentence = 'steph curry is a basketball player'


def regex_preprocess(raw_tweets):
    """Normalise raw tweet texts and return them as a pandas Series.

    Three substitutions are applied in order:
      1. @mentions are replaced by the token USERNAME,
      2. web addresses are replaced by the token URL,
      3. any character repeated three or more times in a row is cut to two.

    raw_tweets: a list (or other sequence) of tweet strings.
    Returns a Series with one cleaned string per input element.

    Keep this in step with the regex_preprocess copies inside the
    mdl.train_sentiment_model1 / mdl.apply_sentiment_model1 database
    functions, so local scoring sees the same text as the model did.
    """
    # dtype=object keeps the .str accessor usable when raw_tweets is empty.
    pp_text = pd.Series(raw_tweets, dtype = object)

    # Raw strings; the pattern text is the same as before.
    user_pat = r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9]+)'
    http_pat = r'(https?:\/\/(?:www\.|(?!www))[^\s\.]+\.[^\s]{2,}|www\.[^\s]+\.[^\s]{2,})'
    repeat_pat, repeat_repl = r'(.)\1\1+', r'\1\1'

    # regex=True is spelled out because pandas 2.0 changed the default of
    # Series.str.replace to literal matching.
    pp_text = pp_text.str.replace(pat = user_pat, repl = 'USERNAME', regex = True)
    pp_text = pp_text.str.replace(pat = http_pat, repl = 'URL', regex = True)
    # str.replace returns a new Series, so the result has to be assigned back;
    # without the assignment the repeated-character step had no effect.
    pp_text = pp_text.str.replace(pat = repeat_pat, repl = repeat_repl, regex = True)
    return pp_text



0    steph curry is a basketball player
dtype: object

In [44]:
# Run two sample strings through the same preprocessing and the unpickled
# pipeline; the bare expression shows one row of class probabilities per input.
cl.predict_proba(regex_preprocess(['hello', 'world']))

array([[ 0.52978211,  0.47021789],
       [ 0.52978211,  0.47021789]])