In [23]:
import csv
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LogisticRegression
from verstack.stratified_continuous_split import scsplit # pip install verstack

import sys
import pickle
from pickle import dump
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from collections import Counter
import seaborn as sns
import joblib
from sklearn.utils import shuffle

In [25]:
def get_data(training=50000,testing=30000, random_state=None):
    """Load the preprocessed tweets, split them and build the feature matrices.

    The function unpickles ``data/train_data_preprocessed.csv`` (despite the
    extension the file is read with ``pickle``), performs a 70/30 split
    stratified on ``retweet_count`` with ``scsplit``, keeps only the first
    ``training`` / ``testing`` rows of each side, and runs the features
    through a ``ColumnTransformer`` (standard scaling for the numeric
    columns, TF-IDF for ``text``, token counts for ``hashtags_transf``).
    The transformer is fitted on the training rows only and then applied to
    the test rows.

    Side effects: changes the global pandas display options, sets the
    seaborn context to ``"paper"`` and prints the shape and type of the
    transformed training matrix.

    Parameters
    ----------
    training : int
        Maximum number of rows kept from the training side of the split.
    testing : int
        Maximum number of rows kept from the test side of the split.
    random_state : int or None
        When given, forwarded to ``scsplit`` so the split is repeatable.
        ``None`` (the default) leaves ``scsplit`` at its own default.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the ``X_*`` values are
        the transformed feature matrices and the ``y_*`` values are the
        ``retweet_count`` targets returned by the split.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only ever point this at a file produced by a trusted preprocessing run.
    with open("data/train_data_preprocessed.csv", "rb") as pickle_in:
        train_data_prepro = pickle.load(pickle_in)

    # Global display configuration (kept from the original notebook).
    for option in ('display.max_rows', 'display.max_columns', 'display.width', 'display.max_colwidth'):
        pd.set_option(option, None)

    sns.set(context="paper")

    # Only pass random_state when the caller asked for it, so the default
    # call behaves exactly as before.
    split_kwargs = {} if random_state is None else {"random_state": random_state}
    target = train_data_prepro['retweet_count']
    X_train, X_test, y_train, y_test = scsplit(
        train_data_prepro,
        target,
        stratify=target,
        train_size=0.7,
        test_size=0.3,
        **split_kwargs
    )

    # Sub-sample both sides of the split to the requested sizes.
    X_train = X_train.head(training)
    X_test = X_test.head(testing)
    y_train = y_train.head(training)
    y_test = y_test.head(testing)

    # We remove the actual number of retweets from our features since it is the value that we are trying to predict
    X_train = X_train.drop(['retweet_count'], axis=1)
    X_test = X_test.drop(['retweet_count'], axis=1)

    # Column names are listed directly; no need to slice the frame to get them.
    num_attribs = [
        "user_verified",
        "timestamp_transf_hour",
        "timestamp_transf_weekday",
        "hashtags_count",
        "user_statuses_count",
        "user_followers_count",
        "user_friends_count",
    ]
    # Scalar column selectors: the text vectorizers receive a 1-D column.
    text_attribs = "text"
    bin_counting_nominal_cat_attribs = "hashtags_transf"

    num_pipe = Pipeline([('std_scaler', StandardScaler())])
    text_pipe = Pipeline([('tfidf_vect', TfidfVectorizer(max_features=100, stop_words='english'))])
    bin_counting_nominal_cat_pipe = Pipeline([('count_vect', CountVectorizer(max_features=20))])

    full_pipe = ColumnTransformer([
        ('num', num_pipe, num_attribs),
        ('text', text_pipe, text_attribs),
        ('bin_counting', bin_counting_nominal_cat_pipe, bin_counting_nominal_cat_attribs),
    ])

    # Fit on the training rows only, then reuse the fitted transformer on the
    # test rows so no test statistics leak into the features.
    X_train = full_pipe.fit_transform(X_train)
    X_test = full_pipe.transform(X_test)

    print("SHAPE OF X_train", X_train.shape)
    print("type(X_train) = ", type(X_train))
    print("-----------------------------------")
    return X_train, X_test, y_train, y_test


def train(X_train,y_train, **model_params):
    """Fit a scikit-learn ``LogisticRegression`` and return the fitted model.

    Prints ``LogisticRegression`` before fitting.

    NOTE(review): in this notebook ``y_train`` is the ``retweet_count``
    column and the result is scored with mean absolute error, while
    ``LogisticRegression`` is a classifier that treats every distinct target
    value as a class label. The recorded run of this cell ended in a
    ``KeyboardInterrupt``; a regressor may be what was intended -- confirm.

    Parameters
    ----------
    X_train : array-like or sparse matrix
        Training features.
    y_train : array-like
        Training targets.
    **model_params
        Optional keyword arguments forwarded unchanged to
        ``LogisticRegression`` (for example ``max_iter``, ``solver`` or
        ``n_jobs``). With none given the estimator uses its defaults, as
        before.

    Returns
    -------
    LogisticRegression
        The fitted estimator.
    """
    # Logistic Regression
    print("LogisticRegression")
    log_reg = LogisticRegression(**model_params)
    log_reg.fit(X_train, y_train)
    return log_reg
    
def predict(log_reg, print_features = False, *, X_train=None, X_test=None, y_train=None, y_test=None):
    """Print the mean absolute error of ``log_reg`` on the train and test sets.

    Parameters
    ----------
    log_reg : fitted estimator
        Model exposing ``predict`` (and ``coef_`` when ``print_features`` is
        true).
    print_features : bool
        When true, also print and bar-plot the first row of
        ``log_reg.coef_``. Only row 0 is shown; if ``coef_`` has several
        rows the others are not displayed.
    X_train, X_test, y_train, y_test : optional, keyword-only
        Data to evaluate on. Any argument left as ``None`` falls back to the
        module-level variable of the same name, which is how the original
        notebook cell supplied the data.

    Raises
    ------
    NameError
        If a data argument is omitted and no module-level variable of that
        name exists.
    """
    def _resolve(value, name):
        # Prefer the explicit argument; otherwise use the notebook global.
        if value is not None:
            return value
        try:
            return globals()[name]
        except KeyError:
            raise NameError("name '{}' is not defined".format(name)) from None

    X_train = _resolve(X_train, "X_train")
    X_test = _resolve(X_test, "X_test")
    y_train = _resolve(y_train, "y_train")
    y_test = _resolve(y_test, "y_test")

    pred_log_reg_train = log_reg.predict(X_train)
    pred_log_reg_test = log_reg.predict(X_test)
    log_reg_train_mae = mean_absolute_error(y_true=y_train, y_pred=pred_log_reg_train)
    log_reg_test_mae = mean_absolute_error(y_true=y_test, y_pred=pred_log_reg_test)
    print("Logistic Regression prediction error for training set: {} and for testing set: {}".format(log_reg_train_mae, log_reg_test_mae))

    if print_features:
        # Coefficients of the first row of coef_ are used as feature scores.
        importance = log_reg.coef_[0]
        for i,v in enumerate(importance):
            print('Feature: %0d, Score: %.5f' % (i,v))
        # plot feature importance
        plt.bar(range(len(importance)), importance)
        plt.show()

In [38]:
# Build the feature matrices: at most 50000 training rows and 100000 test rows.
X_train, X_test, y_train, y_test = get_data(training=50000, testing=100000)

SHAPE OF X_train (50000, 127)
type(X_train) =  <class 'scipy.sparse.csr.csr_matrix'>
-----------------------------------


In [39]:
# Fit the model on the training split produced by get_data.
model = train(X_train=X_train, y_train=y_train)

LogisticRegression


KeyboardInterrupt: 

In [None]:
# Report train/test error and also print and plot the per-feature coefficients.
predict(model, print_features=True)

In [None]:
# NOTE(review): sys.exit() raises SystemExit, so every statement below it in
# this cell is unreachable as written.
sys.exit()
# Horizontal bar chart of feature importances, sorted ascending by score.
# NOTE(review): `importances` and `col_names` are not defined anywhere in the
# visible notebook; they must be provided before this code can be re-enabled.
idxs = np.argsort(importances)
plt.title('Feature Importances')
plt.barh(range(len(idxs)), importances[idxs], align='center')
# Label each bar with the column name that matches its sorted position.
plt.yticks(range(len(idxs)), [col_names[i] for i in idxs])
plt.xlabel('Logistic Regression Feature Importance')
plt.show()