In [24]:
import pandas as pd

# Column names assigned to the raw CSV, which is read as a header-less file
columns = ['target', 'id', 'date', 'query', 'user', 'text']

# Number of rows drawn from every sentiment class; change here to resize the sample
SAMPLES_PER_CLASS = 30000

# Load the raw dataset, supplying the column names explicitly
df = pd.read_csv('data_set.csv', encoding='ISO-8859-1', header=None, names=columns)

# Check the first few rows to verify
print(df.head())

# Draw SAMPLES_PER_CLASS rows from each distinct `target` value. Every group is
# sampled with seed 42, so the selection is reproducible. Concatenating the
# per-group samples gives the same rows as groupby(...).apply(sample) without
# going through DataFrameGroupBy.apply, which newer pandas releases warn about
# when the callable also receives the grouping column.
# sample() raises ValueError if a class has fewer than SAMPLES_PER_CLASS rows.
df_sample = pd.concat(
    [
        class_rows.sample(n=SAMPLES_PER_CLASS, random_state=42)
        for _, class_rows in df.groupby('target')
    ]
)

# Save the sampled dataset (column header row is written, the index is not)
df_sample.to_csv('sampled_dataset.csv', index=False)


   target          id                          date     query  \
0       0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1       0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2       0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3       0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4       0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   

              user                                               text  
0  _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1    scotthamilton  is upset that he can't update his Facebook by ...  
2         mattycus  @Kenichan I dived many times for the ball. Man...  
3          ElleCTF    my whole body feels itchy and like its on fire   
4           Karoli  @nationwideclass no, it's not behaving at all....  


In [25]:
!pip install scikit-learn


import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Download stopwords
nltk.download('stopwords')

# Load dataset (modify path if needed).
# sampled_dataset.csv is written by DataFrame.to_csv with a header row, so let
# pandas consume that row and select the two needed columns by name. Reading it
# with header=None would turn the header into a bogus data row and keep
# `target` from being parsed as integers, so the label mapping below would
# never match.
# NOTE(review): to_csv writes UTF-8 by default, so 'latin-1' may mis-decode
# non-ASCII characters here; preprocessing strips non-letters anyway, but
# confirm the intended encoding.
df = pd.read_csv("sampled_dataset.csv", encoding='latin-1', usecols=['target', 'text'])

# Convert sentiment labels (0 = negative, 2 = neutral, 4 = positive)
df['target'] = df['target'].replace({0: 'Negative', 2: 'Neutral', 4: 'Positive'})

# Text preprocessing function
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, reading the corpus only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text, stop_words=None):
    """Normalise one piece of text for TF-IDF vectorisation.

    Steps, in order: strip URLs, strip @mentions, drop every character that is
    not an ASCII letter or whitespace, lowercase, and remove stopwords.

    Args:
        text: The raw text. Anything that is not a ``str`` (e.g. ``None`` or a
            NaN from a missing CSV cell) is treated as empty text.
        stop_words: Optional collection of lowercase words to remove. When
            omitted, NLTK's English stopword list is used; it is loaded once
            and cached instead of being re-read for every word.

    Returns:
        The cleaned text as a single space-separated string (possibly empty).
    """
    if not isinstance(text, str):
        return ''
    if stop_words is None:
        stop_words = _english_stopwords()

    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters & numbers
    # split() already discards leading/trailing whitespace, so no strip() is needed
    kept_words = [word for word in text.lower().split() if word not in stop_words]
    return ' '.join(kept_words)

# Clean every text with the preprocessing function
df['text'] = df['text'].apply(preprocess_text)
y = df['target']

# Split the cleaned text BEFORE vectorising, so the TF-IDF vocabulary and IDF
# weights are learned from the training rows only. Fitting the vectorizer on
# the full dataset would leak information from the test rows into the features.
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, random_state=42
)

# Convert text to numerical features using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)  # Use top 5000 words
X_train = vectorizer.fit_transform(text_train)  # fit on training text only
X_test = vectorizer.transform(text_test)  # reuse the fitted vocabulary
X = vectorizer.transform(df['text'])  # full feature matrix, kept under its original name

# Train RandomForestClassifier utilizing all CPU cores
clf = RandomForestClassifier(n_estimators=100, max_depth=10, n_jobs=-1, random_state=42)
clf.fit(X_train, y_train)

# Predict and evaluate on the held-out rows
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")


You should consider upgrading via the 'c:\users\cse\nani\myenv\scripts\python.exe -m pip install --upgrade pip' command.




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cse\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model Accuracy: 0.70


In [41]:
# Two example sentences to classify
input_text1, input_text2 = "hello im fine", "very very dangerous"

# Clean both examples with the same preprocessing function used on the dataset
input_text1_processed, input_text2_processed = (
    preprocess_text(input_text1),
    preprocess_text(input_text2),
)

# Turn each cleaned example into a one-row feature matrix with the fitted TF-IDF vectorizer
input_vector1, input_vector2 = (
    vectorizer.transform([input_text1_processed]),
    vectorizer.transform([input_text2_processed]),
)

# Ask the trained classifier for a prediction on each one-row matrix
predicted_sentiment1, predicted_sentiment2 = (
    clf.predict(input_vector1),
    clf.predict(input_vector2),
)

# Numeric code associated with each sentiment label name
sentiment_map = dict(Negative=0, Neutral=2, Positive=4)

# Each prediction holds a single entry (one input row); pull it out
predicted_label1, predicted_label2 = predicted_sentiment1[0], predicted_sentiment2[0]

# Function to print sentiment with explanation
def print_sentiment(input_text, predicted_label):
    """Print the predicted sentiment of ``input_text`` together with its numeric code.

    Args:
        input_text: The original (unprocessed) text that was classified.
        predicted_label: The label produced by the classifier; expected to be
            'Negative', 'Neutral' or 'Positive'.

    Any other label is reported as unknown instead of being silently shown as
    "Positive", so a label-encoding problem upstream stays visible.
    """
    label_codes = {'Negative': 0, 'Neutral': 2, 'Positive': 4}
    if predicted_label in label_codes:
        print(
            f"The sentiment of the input text '{input_text}' is: "
            f"{predicted_label} ({label_codes[predicted_label]})"
        )
    else:
        print(
            f"The sentiment of the input text '{input_text}' is: "
            f"Unknown (unrecognised label {predicted_label!r})"
        )

# Report the prediction for each example, in order
for sentence, label in zip((input_text1, input_text2), (predicted_label1, predicted_label2)):
    print_sentiment(sentence, label)


The sentiment of the input text 'hello im fine' is: Positive (4)
