# Build Selected Sentiment Model
Assume the best model to be LogisticRegression, rebuild model to use on our data based on the above score

In [6]:
def sentiment_model():
    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import roc_auc_score
    from sklearn.feature_extraction.text import TfidfVectorizer

    #traning data
    sentiment_data_df = pd.read_csv('../data/9.Sentiment Training 1.csv',encoding='latin-1', names=['target','ids','date','flag','user','text'])
    sentiment_data_df = sentiment_data_df.sample(frac=0.1, random_state=0) #Train with fraction of the file for faster processing
    sentiment_data_df['target']=sentiment_data_df['target'].replace(4, 1) #positive sentiment is recored as 1 now instead of 4
    sentiment_data_df['target']=sentiment_data_df['target'].replace(0, -1) #negative sentiment is recorded as -1 now instead of 0
    
    X_train, X_test, y_train, y_test = train_test_split(sentiment_data_df['text'], sentiment_data_df['target'], random_state=0)

    #vectorize traning data
    sentiment_vect = TfidfVectorizer(min_df=5,ngram_range=(1,2)).fit(X_train)
    X_train_vectorized = sentiment_vect.transform(X_train)

    #the model
    sentiment_model = LogisticRegression(C=1).fit(X_train_vectorized, y_train)

    #evaluate the model
    sentiment_predictions = sentiment_model.predict(sentiment_vect.transform(X_test))
    print('AUC: ', roc_auc_score(y_test, sentiment_predictions))
    
    return sentiment_vect, sentiment_model

sentiment_vect, sentiment_model=sentiment_model()

AUC:  0.8007967867367914


# Predict on tweet data
    - predict sentiment, using both Vader and the built model
    - record to file

In [20]:
def sentiment_predict(file): #print average sentiment of all docs
    import pandas as pd
    df=pd.read_csv(file)
    
    # get sentiment using the built model
    df['builtModel_sentiment']=sentiment_model.predict(sentiment_vect.transform(df.text.astype(str)))

    # get sentiment using nltk vader
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    df['vader_score']=''
    for i in range(0,len(df)):
        df.vader_score[i] = SentimentIntensityAnalyzer().polarity_scores(str(df.text[i]))['compound']
            
    df.to_csv('../data/9.pulledTweetsSentimentResult.csv')

    print('Built model average sentiment:',df['builtModel_sentiment'].mean())
    print('NLTK Vader average sentiment:',df['vader_score'].mean())


In [21]:
import pandas as pd
sentiment_predict('../data/5.pulledTweet-deduplicated.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Built model average sentiment: 0.5178571428571429
NLTK Vader average sentiment: 0.20716741071428574
