In [None]:
! pip install laser_encoders
! pip install chardet

In [None]:
import numpy as np
import pandas as pd
import chardet
from laser_encoders import LaserEncoderPipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tqdm import tqdm

In [None]:
# Single place to change the location of the training CSV.
TRAIN_CSV_PATH = '/content/drive/MyDrive/dataset/train.csv'

# Let chardet guess the character encoding from the file's raw bytes.
with open(TRAIN_CSV_PATH, 'rb') as f:
    result = chardet.detect(f.read())

# Parse the CSV with the guessed encoding and keep only the label and text columns.
data = pd.read_csv(TRAIN_CSV_PATH, encoding=result['encoding'])[['sentiment', 'text']]

In [None]:
# Quick sanity check: first rows of the frame, then its (rows, columns) shape.
print(data.head(), data.shape, sep='\n')

In [None]:
# Integer id assigned to each sentiment label (labels are matched case-insensitively).
SENTIMENT_IDS = {'neutral': 1, 'positive': 2, 'negative': 3}

sentiments = []
texts = []

# Build both lists in lockstep: a row contributes to BOTH lists or to NEITHER,
# so sentiments[i] is always the label of texts[i].
for index, raw_sentiment, text in zip(data.index, data['sentiment'], data['text']):
    # Only strings can be lower-cased; a missing cell is read by pandas as a
    # float NaN and is left as-is so it falls through to the warning below.
    sentiment = raw_sentiment.lower() if isinstance(raw_sentiment, str) else raw_sentiment

    if sentiment not in SENTIMENT_IDS:
        # Skip the whole row. Appending the text without a label would shift
        # every following label onto the wrong text.
        print(f"Warning: Unknown sentiment '{sentiment}' in row {index}")
        continue

    sentiments.append(SENTIMENT_IDS[sentiment])
    texts.append(text)

print(len(sentiments))
print(len(texts))

# Discard list positions 300-399 from both lists (same positions, so the
# pairing is preserved).
# NOTE(review): the reason for dropping this range is not stated anywhere in
# the notebook — presumably it works around bad rows in the CSV; confirm and
# replace with an explicit filter if so.
sentiments = sentiments[:300] + sentiments[400:]
texts = texts[:300] + texts[400:]

In [None]:
# Learn the set of sentiment ids, then re-encode every id with the fitted encoder.
label_encoder = LabelEncoder().fit(sentiments)
encoded_sentiments = label_encoder.transform(sentiments)

# Hold out 20% of the (text, label) pairs for testing; fixed seed for a repeatable split.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    encoded_sentiments,
    **split_options,
)

# LASER sentence encoder configured for English in Latin script.
encoder = LaserEncoderPipeline(lang="eng_Latn")

In [None]:
def _encode_in_batches(sentences, batch_size=64):
    """Embed `sentences` with the notebook's LASER `encoder`.

    The sentences are handed to `encoder.encode_sentences` in chunks of
    `batch_size` instead of one call per sentence, which removes most of the
    per-call overhead. Progress is shown with tqdm (one tick per chunk).

    Args:
        sentences: list of strings to embed.
        batch_size: number of sentences sent to the encoder per call.

    Returns:
        A numpy array with one embedding row per input sentence, in input
        order; an empty array when `sentences` is empty.
    """
    chunks = [
        encoder.encode_sentences(list(sentences[start:start + batch_size]))
        for start in tqdm(range(0, len(sentences), batch_size))
    ]
    # np.concatenate rejects an empty list; np.array([]) matches what
    # converting an empty list of embeddings would have produced.
    return np.concatenate(chunks) if chunks else np.array([])


print("Encoding training sentences:")
X_train_embeddings = _encode_in_batches(X_train)

print("Encoding testing sentences:")
X_test_embeddings = _encode_in_batches(X_test)

In [None]:
# Small feed-forward classifier over the 1024-wide embedding vectors:
# one hidden ReLU layer followed by a 3-way softmax.
model = Sequential([
    Dense(64, input_shape=(1024,), activation='relu'),
    Dense(3, activation='softmax'),  # one output per class (neutral, positive, negative)
])

# Integer class labels -> sparse categorical cross-entropy; track accuracy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit for 20 epochs, holding back 10% of the training data for validation.
model.fit(
    X_train_embeddings,
    y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.1,
)

# evaluate() returns the loss followed by the compiled metrics; index 1 is accuracy.
test_metrics = model.evaluate(X_test_embeddings, y_test)
accuracy = test_metrics[1]
print(f"Accuracy: {accuracy * 100:.2f}%")

In [None]:
# Interactive check: classify one sentence typed by the user.
user_text = input("Enter a text: ")

# Embed the sentence and shape it as a single-row batch for the model.
user_text_embedding = np.reshape(encoder.encode_sentences([user_text])[0], (1, -1))

# Highest-scoring class index, translated back to the original sentiment id.
class_scores = model.predict(user_text_embedding)
predicted_sentiment = np.argmax(class_scores)
predicted_sentiment_no = label_encoder.inverse_transform([predicted_sentiment])[0]

# Id 1 is neutral and id 2 is positive; every other id is reported as negative.
predicted_sentiment_label = {1: 'neutral', 2: 'positive'}.get(predicted_sentiment_no, 'negative')

print(f"Predicted Sentiment: {predicted_sentiment_label}")