<a href="https://colab.research.google.com/github/dawood5253/Consumer-Complaint-Resolution/blob/main/NLP_Miniproject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

In [None]:
#Read the data from the given CSV file

twit=pd.read_csv('Twitter_Data.csv')
# Last expression of the cell: displays the loaded frame
twit

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
...,...,...
162975,why these 456 crores paid neerav modi not reco...,-1.0
162976,dear rss terrorist payal gawar what about modi...,-1.0
162977,did you cover her interaction forum where she ...,0.0
162978,there big project came into india modi dream p...,0.0


In [None]:
#Change our dependent variable to categorical. (0 to “Neutral,”-1 to “Negative”, 1 to “Positive”)
# Maps the numeric sentiment codes in 'category' to string labels; missing values stay missing
# (the null count below still reports 7 missing categories after this step).
twit['category']=twit['category'].replace({0:'Neutral',1:'Positive',-1:'Negative'})

In [None]:
# Display the frame to confirm 'category' now holds string labels
twit

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,Negative
1,talk all the nonsense and continue all the dra...,Neutral
2,what did just say vote for modi welcome bjp t...,Positive
3,asking his supporters prefix chowkidar their n...,Positive
4,answer who among these the most powerful world...,Positive
...,...,...
162975,why these 456 crores paid neerav modi not reco...,Negative
162976,dear rss terrorist payal gawar what about modi...,Negative
162977,did you cover her interaction forum where she ...,Neutral
162978,there big project came into india modi dream p...,Neutral


In [None]:
#Do missing value analysis and drop all null/missing values

# Count missing entries per column
twit.isna().sum()

clean_text    4
category      7
dtype: int64

In [None]:
# Show the rows whose 'clean_text' is missing
twit[twit['clean_text'].isna()]

Unnamed: 0,clean_text,category
148,,Neutral
158694,,Negative
159443,,Neutral
160560,,Positive


In [None]:
# Discard every row that has a missing value in any column
twit = twit.dropna()

In [None]:
# Display the frame after rows with missing values were dropped
twit

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,Negative
1,talk all the nonsense and continue all the dra...,Neutral
2,what did just say vote for modi welcome bjp t...,Positive
3,asking his supporters prefix chowkidar their n...,Positive
4,answer who among these the most powerful world...,Positive
...,...,...
162975,why these 456 crores paid neerav modi not reco...,Negative
162976,dear rss terrorist payal gawar what about modi...,Negative
162977,did you cover her interaction forum where she ...,Neutral
162978,there big project came into india modi dream p...,Neutral


In [None]:
#Do text cleaning. (remove every symbol except alphanumeric, transform all words to lower case, and remove punctuation and stopwords)

def alph(text):
    """Return ``text`` keeping only alphanumeric and whitespace characters.

    Every other character (punctuation, symbols, quotes, ...) is dropped
    without leaving a replacement behind.
    """
    kept = (ch for ch in text if ch.isalnum() or ch.isspace())
    return ''.join(kept)

In [None]:
# Lower-case every tweet using the pandas string accessor
twit['clean_text']=twit['clean_text'].str.lower()

In [None]:
import string

# Characters that remove_punc treats as punctuation
exclude = string.punctuation


def remove_punc(text):
    """Return ``text`` with each character listed in ``exclude`` replaced by one space."""
    # Build the translation table at call time so the current value of `exclude` is used.
    to_space = {ord(symbol): ' ' for symbol in exclude}
    return text.translate(to_space)

In [None]:
# Strip every character that is neither alphanumeric nor whitespace from each tweet
twit['clean_text']=twit['clean_text'].apply(alph)

In [None]:
# Replace punctuation characters with spaces.
# NOTE(review): alph() has already been applied to this column and removes every
# non-alphanumeric, non-whitespace character, so this pass should find nothing left
# to replace. If punctuation was meant to become a space (to keep words apart),
# this step would need to run before alph() — confirm the intended order.
twit['clean_text']=twit['clean_text'].apply(remove_punc)

In [None]:
import nltk
from nltk.corpus import stopwords
def remove_stopwords(text, stop_words=None):
    """Return ``text`` with stopwords removed.

    The text is split on whitespace, every token found in the stopword
    collection is dropped, and the remaining tokens are joined with single
    spaces.

    Args:
        text: String to filter.
        stop_words: Optional collection of words to drop. When omitted, the
            NLTK English stopword list is used; it is loaded once and cached
            on the function so repeated calls (e.g. through ``Series.apply``)
            do not rebuild the list.

    Returns:
        The filtered string. Removed words leave no empty placeholder behind,
        so the result has no doubled, leading or trailing spaces.
    """
    if stop_words is None:
        stop_words = getattr(remove_stopwords, '_english_cache', None)
        if stop_words is None:
            # Build the lookup set once; previously the full list was rebuilt
            # and scanned linearly for every single word of every tweet.
            stop_words = frozenset(stopwords.words('english'))
            remove_stopwords._english_cache = stop_words
    return ' '.join(word for word in text.split() if word not in stop_words)

In [None]:
# Download the NLTK 'stopwords' corpus that remove_stopwords reads via stopwords.words('english')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Remove English stopwords from every tweet
twit['clean_text']=twit['clean_text'].apply(remove_stopwords)

In [None]:
#Create a new column and find the length of each sentence (how many words they contain)

def size(text):
    """Return the number of whitespace-separated words in ``text``."""
    words = text.split()
    return len(words)

In [None]:
# New column: word count of each cleaned tweet
twit['text_size']=twit['clean_text'].apply(size)

In [None]:
#Split data into independent (X) and dependent (y) dataframes

# Features: every column except the label
X = twit.drop(columns='category')
# Target: kept as a one-column DataFrame rather than a Series
y = twit[['category']]

In [None]:
#Do operations on text data
#Do one-hot encoding for each sentence (use TensorFlow)
#Add padding from the front side (use TensorFlow)
#Build an LSTM model and compile it (describe features, input length, vocabulary size, information drop-out layer, activation function for output)
#Do dummy variable creation for the dependent variable
#Split the data into test and train sets

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
# Plain Python list of the cleaned tweets, used as input for the tokenizer
sent=X['clean_text'].to_list()

In [None]:
# Fit the tokenizer on the cleaned tweets and turn each tweet into a list of word indices.
token=Tokenizer()
token.fit_on_texts(sent)
seq=token.texts_to_sequences(sent)
# NOTE: the former `ohe = token.texts_to_matrix(sent, mode='binary')` was removed.
# It built a dense (number of tweets x vocabulary size) matrix — roughly
# 163k x 107k here — that can exhaust memory, and `ohe` was never used afterwards;
# the model consumes the padded index sequences instead.

In [None]:
# Longest tokenised tweet determines the common sequence length
max_length = max(len(s) for s in seq)
# Left-pad ('pre') every sequence with zeros up to max_length
padded_sequences = pad_sequences(seq, maxlen=max_length, padding='pre')


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Define the vocabulary size (+1 because index 0 is reserved for padding)
vocab_size = len(token.word_index) + 1

# Define input length (max_length determined from padding)
input_length = max_length

# Number of sentiment classes: Negative, Neutral, Positive
num_classes = 3

# Build the LSTM model
# The output layer has one unit per class with softmax, so the model emits a
# probability distribution over the three sentiments; predictions must be
# decoded with an argmax over the last axis.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=input_length),
    LSTM(units=64),
    Dense(units=num_classes, activation='softmax')
])

# Compile the model
# The targets fed to fit() are integer class ids (from LabelEncoder), which is
# what the sparse variant of categorical cross-entropy expects. The previous
# single sigmoid unit with 'categorical_crossentropy' could not learn a
# three-class problem.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 43, 128)           13690240  
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 13739713 (52.41 MB)
Trainable params: 13739713 (52.41 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# One-hot (dummy) columns for the sentiment labels in 'category'
dummy_variables = pd.get_dummies(twit['category'])

# Concatenate the dummy variables with the original DataFrame.
# 'category' is kept in `data` because the train/test split reads data['category'].
data = pd.concat([twit, dummy_variables], axis=1)

# NOTE: `twit` is deliberately left unchanged. The previous in-place drop of
# 'category' from `twit` made this cell (and the earlier X/y split cell) raise a
# KeyError when re-run, and it did not affect `data`, which was already built.


In [None]:
from sklearn.model_selection import train_test_split


# Split the data into training and testing sets
# 80/20 split of the padded sequences and their string labels; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, data['category'], test_size=0.2, random_state=42)

# Print the shapes of training and testing sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)


Shape of X_train: (130375, 43)
Shape of X_test: (32594, 43)
Shape of y_train: (130375,)
Shape of y_test: (32594,)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the target labels
# The encoder is fitted on the training labels only and then reused for the test labels.
# NOTE(review): presumably Negative=0, Neutral=1, Positive=2 (sorted order) — verify via label_encoder.classes_
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)


In [None]:
# Train the model on the integer-encoded labels.
# NOTE: the test split doubles as validation data here, so the per-epoch
# validation metrics are not independent of the final test evaluation.
history=model.fit(X_train,y_train_encoded,epochs=15,validation_data=(X_test,y_test_encoded),batch_size=32)

# Evaluate on the held-out split; returns the loss followed by the compiled metric (accuracy)
loss,acc=model.evaluate(X_test,y_test_encoded)

print('Test Loss ',loss)
# Fixed: the original used '.acc' (attribute access on a str), which raised AttributeError
print('Test Accuracy ',acc)

Epoch 1/15


  return dispatch_target(*args, **kwargs)


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

# Predict probabilities for test data
y_pred_probs = model.predict(X_test)

# Decode predictions into integer class ids.
# A 0.5 threshold can only ever produce classes 0 and 1, so the third class was
# never predicted. With one probability column per class, take the most likely
# class; fall back to thresholding only when the model emits a single column.
if y_pred_probs.ndim > 1 and y_pred_probs.shape[-1] > 1:
    y_pred = y_pred_probs.argmax(axis=-1)
else:
    y_pred = (y_pred_probs.ravel() >= 0.5).astype(int)

# Compute accuracy
accuracy = accuracy_score(y_test_encoded, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test_encoded, y_pred))


Accuracy: 0.4624777566423268
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.76      0.71      7152
           1       0.40      0.87      0.54     11067
           2       0.00      0.00      0.00     14375

    accuracy                           0.46     32594
   macro avg       0.35      0.54      0.42     32594
weighted avg       0.28      0.46      0.34     32594

