<a href="https://colab.research.google.com/github/chalakajaniththa/data/blob/main/research.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd

In [3]:
#read csv file and create dataframe

df = pd.read_csv('/content/drive/MyDrive/research/dataset/modified-sentence.csv')

In [6]:
# Filter rows where the Language is either "Sin-Eng" or "Mixed"
filtered_df = df[df['Language'].isin(['Sin-Eng', 'Mixed'])]

# Select the "Sentence" and "Sentiment" columns from the filtered DataFrame
selected_columns = filtered_df[["Sentence", "Sentiment"]]

# Display the selected columns
print(selected_columns)


                                               Sentence Sentiment
0           Ammage Adarayta❤️Eka Dawasak Madi Neda❤️🙏❤️  Negative
3     chandimal.. uuu thama mulu tem ekama kaaa gaha...  Positive
5               Lebsack 49k dammama eka dawasak wath be  Negative
6     eth anith kattiya sathiyak withara online inna...   Neutral
7                           meka salli kanawane...ai e?  Negative
...                                                 ...       ...
9586  why, how and when did you put a expiry date fo...  Negative
9593                                Gerhold nam jarawak  Negative
9594  Me mase web family plus packge eke total eka 6...   Neutral
9595        munge Dan data kapana widiye awulk thiyenwa  Negative
9598    Signal ne ne anee. Ekata mokak hari karannakoo.  Negative

[4579 rows x 2 columns]


In [7]:
from sklearn.utils import resample

# Separate the DataFrame into classes based on 'Sentiment'
positive_class = selected_columns[selected_columns['Sentiment'] == 'Positive']
negative_class = selected_columns[selected_columns['Sentiment'] == 'Negative']
neutral_class = selected_columns[selected_columns['Sentiment'] == 'Neutral']
conflict_class = selected_columns[selected_columns['Sentiment'] == 'Conflict']

# Determine the class with the maximum number of samples
max_samples = max(len(positive_class), len(negative_class), len(neutral_class), len(conflict_class))

# Up-sample the classes with fewer samples
positive_upsampled = resample(positive_class, replace=True, n_samples=max_samples, random_state=42)
negative_upsampled = resample(negative_class, replace=True, n_samples=max_samples, random_state=42)
neutral_upsampled = resample(neutral_class, replace=True, n_samples=max_samples, random_state=42)
conflict_upsampled = resample(conflict_class, replace=True, n_samples=max_samples, random_state=42)

# Combine the up-sampled classes into a balanced DataFrame
balanced_df = pd.concat([positive_upsampled, negative_upsampled, neutral_upsampled, conflict_upsampled])

# Display the balanced DataFrame
print(balanced_df)


                                               Sentence Sentiment
2162                                Call time ehma mru.  Positive
9568                  Thank you 6GB bonus ekak dunnata.  Positive
6701                                   Gerhold niyamai.  Positive
2234                                      Sathutui 😃😃😃😃  Positive
1460  FB Messenger is actually one of the best messa...  Positive
...                                                 ...       ...
4843  Hodai habai signal na miss call gahala salli k...  Conflict
569                    Signal ne..nthnm Hilpert hodai😭😭  Conflict
301                    Bartell walada wada hida Lebsack  Conflict
1098  apit kemati Gerhold pawichchi karanna et signa...  Conflict
1680  eka thama hodama eth time base yaddi connecrio...  Conflict

[11348 rows x 2 columns]


In [8]:
# Check the number of samples for each sentiment class in the balanced DataFrame
class_counts = balanced_df['Sentiment'].value_counts()

# Display the class counts
print(class_counts)


Positive    2837
Negative    2837
Neutral     2837
Conflict    2837
Name: Sentiment, dtype: int64


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Assuming balanced_df is your balanced DataFrame with "Sentence" and "Sentiment" columns
X = balanced_df['Sentence']
y = balanced_df['Sentiment']

# Convert string labels to integer labels using LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_sequences)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoded, test_size=0.2, random_state=42)

# Create and compile the LSTM model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=X_padded.shape[1]))
model.add(LSTM(100))
model.add(Dense(4, activation='softmax'))  # Assuming 4 classes (Positive, Negative, Neutral, Conflict)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Accuracy: {accuracy * 100:.2f}%')


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 94.14%
