In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential

# Read the training and test data
train_data = pd.read_csv('Punjabi_train.csv', encoding='utf-8')
test_data = pd.read_csv('Punjabi_test.csv', encoding='utf-8')

# Preprocess the training and test data
X_train = train_data['Sentence'].astype(str)
X_test = test_data['Sentence'].astype(str)

# Convert labels to numbers
label_mapping = {'Appreciation': 1, 'Disappointment': 0, 'Hate': 2, 'Blessing': 3, 'Support': 4, 'Neutral': 5, 'Sadness': 6}

# Check unique labels in the data
unique_train_labels = train_data['Label'].unique()
unique_test_labels = test_data['Label'].unique()
print("Unique labels in training data:", unique_train_labels)
print("Unique labels in test data:", unique_test_labels)

# Ensure all labels are in the label mapping
invalid_train_labels = [label for label in unique_train_labels if label not in label_mapping]
invalid_test_labels = [label for label in unique_test_labels if label not in label_mapping]
print("Invalid labels in training data:", invalid_train_labels)
print("Invalid labels in test data:", invalid_test_labels)

# Filter out invalid labels or replace them
train_data = train_data[train_data['Label'].isin(label_mapping)]
test_data = test_data[test_data['Label'].isin(label_mapping)]

# Update X_train and y_train after filtering
X_train = train_data['Sentence'].astype(str)
y_train = train_data['Label'].map(label_mapping)

# Update X_test and y_test after filtering
X_test = test_data['Sentence'].astype(str)
y_test = test_data['Label'].map(label_mapping)

# Check for consistent lengths
assert len(X_train) == len(y_train), "Training data and labels size mismatch"
assert len(X_test) == len(y_test), "Test data and labels size mismatch"

# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# LSTM model
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_train_pad = pad_sequences(X_train_seq, maxlen=200)
X_test_pad = pad_sequences(X_test_seq, maxlen=200)

model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=200),
    LSTM(64),
    Dense(7, activation='softmax')  # 7 classes as per label_mapping
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train_pad, y_train, epochs=10, batch_size=32, validation_data=(X_test_pad, y_test))

# Evaluation of LSTM model
y_pred_dl_prob = model.predict(X_test_pad)
y_pred_dl = y_pred_dl_prob.argmax(axis=-1)
accuracy_dl = accuracy_score(y_test, y_pred_dl)
precision_dl = precision_score(y_test, y_pred_dl, average='weighted')
recall_dl = recall_score(y_test, y_pred_dl, average='weighted')
f1_dl = f1_score(y_test, y_pred_dl, average='weighted')


print("Accuracy:", accuracy_dl)
print("Precision:", precision_dl)
print("Recall:", recall_dl)
print("F1 Score:", f1_dl)



new_text = "ਮੈਂ ਚੰਗਾ ਹਾਂ"    # Example text
new_text_seq = tokenizer.texts_to_sequences([new_text])
new_text_pad = pad_sequences(new_text_seq, maxlen=200)
predicted_sentiment_dl_prob = model.predict(new_text_pad)
predicted_sentiment_dl = predicted_sentiment_dl_prob.argmax(axis=-1)



# Read the sentences from the text file
with open('punjabi', 'r', encoding='utf-8') as file:
    sentences = file.readlines()

results = []

# Predict sentiments for each sentence
for sentence in sentences:
    sentence = sentence.strip()



    # LSTM prediction
    sentence_seq = tokenizer.texts_to_sequences([sentence])
    sentence_pad = pad_sequences(sentence_seq, maxlen=200)
    predicted_sentiment_dl_prob = model.predict(sentence_pad)
    predicted_sentiment_dl = predicted_sentiment_dl_prob.argmax(axis=-1)[0]

    # Map the predicted sentiment back to the label
    sentiment_mapping = {1:'Appreciation',0: 'Disappointment',2: 'Hate',3: 'Blessing', 4:'Support',5: 'Neutral',6: 'Sadness'}


    #predicted_sentiment_nb_label = sentiment_mapping[predicted_sentiment_nb]
    predicted_sentiment_dl_label = sentiment_mapping[predicted_sentiment_dl]

    # Append the results for each model (NB and DL)
    #results.append([predicted_sentiment_nb_label, sentence])
    results.append([predicted_sentiment_dl_label, sentence])

# Create a DataFrame and save to CSV
results_df = pd.DataFrame(results, columns=['Sentiment', 'Sentence'])
results_df.to_csv('punjabi_predicted.csv', index=False, encoding='utf-8')

# Download the CSV file
from google.colab import files
files.download('punjabi_predicted.csv')


Unique labels in training data: ['Blessing' 'Appreciation' 'Disappointment' 'Hate' 'Neutral' 'Hate '
 'Sadness' 'Support']
Unique labels in test data: ['Appreciation' 'Sadness' 'Neutral' 'Support' 'Disappointment' 'Hate'
 'Blessing']
Invalid labels in training data: ['Hate ']
Invalid labels in test data: []
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.3691275167785235
Precision: 0.29372190120709657
Recall: 0.3691275167785235
F1 Score: 0.32412101440713237


  _warn_prf(average, modifier, msg_start, len(result))




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import drive
drive.mount('/content/drive')