In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support as score

### Step #1 Load the Data

In [20]:
data = pd.read_csv("all-data.csv") 

In [21]:
data

Unnamed: 0,sentiment,sentence
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


## Step #2 Clean and Preprocess the Data


In [22]:
# Define Class Integer Values
cleanup_nums = {"sentiment":{"negative": 1, "neutral": 2, "positive": 3}}

In [33]:
# Replace the Classes with Integer Values
data.replace(cleanup_nums, inplace=True)
data

Unnamed: 0,sentiment,sentence
0,2,"According to Gran , the company has no plans t..."
1,2,Technopolis plans to develop in stages an area...
2,1,The international electronic industry company ...
3,3,With the new production plant the company woul...
4,3,According to the company 's updated strategy f...
...,...,...
4841,1,LONDON MarketWatch -- Share prices ended lower...
4842,2,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,1,Operating profit fell to EUR 35.4 mn from EUR ...
4844,1,Net sales of the Paper segment decreased to EU...


### Step #3 Train a Logistic Regression Classifier


In [35]:
#split the data to train and test 
X= data['sentence'] 
y= data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [36]:
# Create a transformation pipeline
# The pipeline sequentially applies a list of transforms and as a final estimator logistic regression 
pipeline_log = Pipeline([
                ('count', CountVectorizer()),#Convert a collection of text documents to a matrix of token counts.
                ('tfidf', TfidfTransformer()),#Transform a count matrix to a normalized tf or tf-idf representation.
                ('clf', LogisticRegression(solver='saga', multi_class='auto')),
        ])

In [37]:
# Train model using the created sklearn pipeline
model_lgr = pipeline_log.fit(X_train,  y_train)

In [38]:
y_pred = model_lgr.predict(X_test)
y_pred

array([2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 3, 2,
       2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 3, 2, 2, 3, 2, 2, 3, 3, 2, 1, 2,
       2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2,
       2, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 3, 2, 2, 2, 3, 2,
       2, 2, 1, 2, 3, 3, 2, 1, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1, 1, 2, 2, 3, 2, 3, 2, 2, 1,
       2, 3, 3, 1, 3, 3, 2, 3, 2, 3, 3, 3, 2, 1, 2, 2, 2, 3, 2, 2, 2, 3,
       2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 3, 3, 3, 2, 3,
       2, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 1, 3, 2, 3,
       3, 1, 2, 3, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2,
       2, 2, 2, 3, 1, 2, 3, 2, 3, 2, 2, 2, 2, 2, 3, 3, 2, 2, 3, 2, 2, 2,
       2, 3, 2, 1, 2, 2, 2, 3, 2, 2, 2, 2, 3, 2, 2, 3, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 3,
       1, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1, 2, 2, 2, 3,

In [39]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.82      0.49      0.61       104
           2       0.76      0.93      0.84       593
           3       0.72      0.49      0.59       273

    accuracy                           0.76       970
   macro avg       0.77      0.64      0.68       970
weighted avg       0.76      0.76      0.74       970



After doing logistc regression classifer: we can see that the data here is imbalnace theres is less samples for nutral class and less number of sample for positive class. 

Step #4 Make Test Predictions

In [47]:
testphrases = ['nice weather!', 'small bird']
for testphrase in testphrases:
    resultx = model_lgr.predict([testphrase]) 
    dict = {1: 'negative', 2: 'neutral', 3: 'positive'}
    print(testphrase + ' -----> ' + dict[resultx[0]])

nice weather! -----> neutral
small bird -----> neutral


findings: the accuracy of the model need to be increase since the model could it predicate the first sentance correctly. 