
### load data


In [25]:
import pandas as pd

# load the CSV file into a DataFrame
df = pd.read_csv('all_data.csv')

# display the first 5 rows of the DataFrame
print(df.head())

                                              review  sentiment
0                                 Aditya Ingole Deaf          2
1  I love the app.! There is no issue but if u co...          1
2  So hard to use. The web app failed, and the mo...          0
3  I hate that the app makes a sound every time s...          1
4  Useless at BSE star MF meet.voice too mych slo...          0


In [26]:
# download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Fawzia\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Fawzia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### apply text preprocessing 

In [27]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re

# define a regular expression function to preprocess text
def preprocess_text(text):
    # remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # convert to lowercase
    text = text.lower()
    # return preprocessed text
    return text


# apply the preprocessing function to the text column of the dataframe
df['review'] = df['review'].apply(preprocess_text)

# display the preprocessed text
print(df)


                                                  review  sentiment
0                                     aditya ingole deaf          2
1      i love the app there is no issue but if u coul...          1
2      so hard to use the web app failed and the mobi...          0
3      i hate that the app makes a sound every time s...          1
4      useless at bse star mf meetvoice too mych slow...          0
...                                                  ...        ...
40513  is good to show my everyday living example as ...          2
40514                                very good this apps          1
40515  the app was very good until recently im not ab...          1
40516  i cant see any background effects there is no ...          1
40517  unable to login from browser the whole day tod...          0

[40518 rows x 2 columns]


### split data

In [30]:
# split the DataFrame into features and labels
X = df['review']  
y = df['sentiment']               # select only the target column as the label
print(X)
print(y)

0                                       aditya ingole deaf
1        i love the app there is no issue but if u coul...
2        so hard to use the web app failed and the mobi...
3        i hate that the app makes a sound every time s...
4        useless at bse star mf meetvoice too mych slow...
                               ...                        
40513    is good to show my everyday living example as ...
40514                                  very good this apps
40515    the app was very good until recently im not ab...
40516    i cant see any background effects there is no ...
40517    unable to login from browser the whole day tod...
Name: review, Length: 40518, dtype: object
0        2
1        1
2        0
3        1
4        0
        ..
40513    2
40514    1
40515    1
40516    1
40517    0
Name: sentiment, Length: 40518, dtype: int64


In [31]:
from sklearn.model_selection import train_test_split


# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# print the size of the training and test sets
print(f'Training set size: {len(X_train)}')
print(f'Test set size: {len(X_test)}')

Training set size: 32414
Test set size: 8104


### apply vectorizer

In [32]:
from sklearn.feature_extraction.text import CountVectorizer

# create a CountVectorizer object
vectorizer = CountVectorizer()

# fit the vectorizer to the text data
vectorizer.fit(df['review'])

# transform the text data into a matrix of features
X = vectorizer.transform(df['review'])

# print the shape of the feature matrix
print(f'Feature matrix shape: {X.shape}')

Feature matrix shape: (40518, 20399)


### create pipeline 

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# create a pipeline with CountVectorizer and LogisticRegression
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('model', LogisticRegression())
])

# fit the pipeline to the data
pipeline.fit(X_train, y_train)

# evaluate the pipeline on the test data
score = pipeline.score(X_test, y_test)
print(f'Accuracy: {score}')


Accuracy: 0.6578232971372162


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### print report

In [35]:
from sklearn.metrics import classification_report

# predict labels for the test data
y_pred = pipeline.predict(X_test)

# print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.68      0.69      2676
           1       0.54      0.45      0.49      2324
           2       0.69      0.79      0.74      3104

    accuracy                           0.66      8104
   macro avg       0.64      0.64      0.64      8104
weighted avg       0.65      0.66      0.65      8104

