In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load the tweets CSV and discard every row that has no tokenized text
twitter_train = (
    pd.read_csv('../Output/tweets1.csv', encoding = "ISO-8859-1")
    .dropna(subset=['Tokenized'])
)

# Assign X (data) and y (target)

### BEGIN SOLUTION
# X: one-column DataFrame holding the tokenized text
X = twitter_train.loc[:, ['Tokenized']]
# y: the Sentiment column reshaped into a single-column 2-D array
y = np.reshape(twitter_train["Sentiment"].values, (-1, 1))
### END SOLUTION
                   
# Split the raw text into training and testing sets BEFORE fitting the
# vectorizer. Fitting TF-IDF on the full corpus and splitting afterwards lets
# the vocabulary and IDF weights see the test rows (data leakage).

### BEGIN SOLUTION
from sklearn.model_selection import train_test_split

text_train, text_test, y_train, y_test = train_test_split(
    twitter_train['Tokenized'], y, random_state=42
)
### END SOLUTION

# Create tfidf scores: fit the transformer on the training text only, then
# apply the fitted transformer to the test text

from sklearn.feature_extraction.text import TfidfVectorizer
tfidfconverter = TfidfVectorizer(max_features=750, analyzer = 'word')
X_train = tfidfconverter.fit_transform(text_train).toarray()
X_test = tfidfconverter.transform(text_test).toarray()

# Full-corpus matrix, kept so that `X` is still defined for any later code;
# it is produced with the train-fitted transformer and is not used for the split
X = tfidfconverter.transform(twitter_train['Tokenized']).toarray()
print(X)

from sklearn.preprocessing import StandardScaler

# Create the StandardScaler models and fit them to the training split only

### BEGIN SOLUTION
X_scaler = StandardScaler()
X_scaler.fit(X_train)
# NOTE(review): y comes from the Sentiment column, which looks like class
# labels; standardizing a classification target is unusual - confirm intent.
y_scaler = StandardScaler()
y_scaler.fit(y_train)
### END SOLUTION

# Transform the training and testing data using the fitted X_scaler and y_scaler

### BEGIN SOLUTION
X_train_scaled, X_test_scaled = (
    X_scaler.transform(part) for part in (X_train, X_test)
)
y_train_scaled, y_test_scaled = (
    y_scaler.transform(part) for part in (y_train, y_test)
)
### END SOLUTION


In [11]:
import logging
def extract_features(df,field,training_data,testing_data,type="binary"):
    """Vectorize one text column of a train/test split.

    The vectorizer is fitted on the training rows only and then applied to the
    test rows, so the vocabulary never depends on the test data.

    Parameters
    ----------
    df
        Not used by this function; kept so existing positional calls keep working.
    field
        Label of the column that holds the text to vectorize.
    training_data, testing_data
        Containers indexable by ``field`` whose result exposes ``.values``
        (the notebook passes the two halves of a DataFrame split).
    type : str
        Selects the representation by substring match, checked in this order:
        contains ``"binary"`` -> ``CountVectorizer(binary=True)``;
        contains ``"counts"`` -> ``CountVectorizer(binary=False)``;
        anything else (including unrecognized values) -> ``TfidfVectorizer``.

    Returns
    -------
    tuple
        ``(train_feature_set, test_feature_set, vectorizer)`` where
        ``vectorizer`` is the fitted vectorizer instance.
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having already imported these names into the global namespace.
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

    logging.info("Extracting features and creating vocabulary...")

    if "binary" in type:
        # BINARY FEATURE REPRESENTATION
        vectorizer = CountVectorizer(binary=True, max_df=0.95)
    elif "counts" in type:
        # COUNT BASED FEATURE REPRESENTATION
        vectorizer = CountVectorizer(binary=False, max_df=0.95)
    else:
        # TF-IDF BASED FEATURE REPRESENTATION (also the fallback for unknown types)
        vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95)

    # Fit and transform the training text in a single pass; the original fitted
    # with fit_transform, discarded the result, and then transformed again.
    train_feature_set = vectorizer.fit_transform(training_data[field].values)
    test_feature_set = vectorizer.transform(testing_data[field].values)

    return train_feature_set, test_feature_set, vectorizer

In [35]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import logging

# Load the tweets CSV, keeping only the rows that have tokenized text
df = (
    pd.read_csv('../Output/tweets1.csv', encoding = "ISO-8859-1")
    .dropna(subset=['Tokenized'])
)

# Train/test split of whole rows (seed fixed so the partition is repeatable)
training_data, testing_data = train_test_split(df, random_state = 2000)

# Features: the 'tfidf' type selects the TF-IDF branch of extract_features
X_train, X_test, feature_transformer = extract_features(
    df,
    field='Tokenized',
    training_data=training_data,
    testing_data=testing_data,
    type='tfidf',
)

# Labels: the Sentiment column of each split
y_train, y_test = (
    split["Sentiment"].values for split in (training_data, testing_data)
)
print(y_train)


[0 1 1 ... 1 1 1]


In [36]:
# Configure, train and apply the logistic regression classifier
logging.info("Training a Logistic Regression Model...")
scikit_log_reg = LogisticRegression(
    penalty='l2',
    C=5,
    solver='liblinear',
    max_iter=1000,
    random_state=0,
    verbose=1,
)
# fit() hands back the fitted estimator, which is kept under the name `model`
model = scikit_log_reg.fit(X_train, y_train)
predictions = model.predict(X_test)
print(predictions)

# Predicted and actual labels side by side; the first rows are the cell output
results = pd.DataFrame(
    {
        "Prediction": predictions,
        "Actual": y_test,
    }
)
results.head()

[LibLinear][1 1 0 ... 1 1 1]


Unnamed: 0,Prediction,Actual
0,1,0
1,1,1
2,0,0
3,1,1
4,1,0


In [37]:
# Per-class precision / recall / f1 on the test split
from sklearn.metrics import classification_report

# NOTE(review): the display names "blue"/"red" do not obviously describe
# sentiment classes - confirm which label each name is meant to stand for.
print(
    classification_report(
        y_test,
        predictions,
        target_names=["blue", "red"],
    )
)

              precision    recall  f1-score   support

        blue       0.70      0.62      0.66     10526
         red       0.72      0.79      0.76     13290

   micro avg       0.72      0.72      0.72     23816
   macro avg       0.71      0.71      0.71     23816
weighted avg       0.71      0.72      0.71     23816

