<a href="https://colab.research.google.com/github/chidambarambaskaran/machinne-learning-projects/blob/main/Natural_Language_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fetch the NLTK resources used by the preprocessing step below.
# Recent NLTK releases load the `punkt_tab` resource inside `word_tokenize`
# (it replaced the pickled `punkt` models), so both are requested: `punkt`
# keeps older NLTK installations working, `punkt_tab` covers current ones.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load the tab-separated review data set.
df = pd.read_csv('Restaurant_Reviews.tsv', sep='\t')

# Lower-case the review text so tokens that differ only by case are merged.
df['Review'] = df['Review'].str.lower()

# Plain Python lists: the review texts and the matching `Liked` labels.
documents = df['Review'].tolist()
labels = df['Liked'].tolist()

# Text preprocessing: shared resources plus the public `preprocess_text` helper.

# Matches every character that is neither a word character nor whitespace.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')

# Filled on first use so the stop-word corpus is only read after it has been
# downloaded, and only once instead of once per document.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the shared ``(stop_words, stemmer)`` pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['stemmer'] = PorterStemmer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['stemmer']


def preprocess_text(text):
    """Normalise one document for vectorisation.

    Steps, in order: strip punctuation, tokenize with ``word_tokenize``,
    drop English stop words (compared case-insensitively), stem each
    remaining token with the Porter stemmer, and re-join with single spaces.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            ``None`` or the ``NaN`` pandas uses for a missing cell) is
            treated as an empty document.

    Returns:
        The space-separated stemmed tokens, or ``''`` when nothing is left.
    """
    # Missing values would otherwise raise TypeError inside the regex call.
    if not isinstance(text, str):
        return ''

    # NOTE: punctuation is removed *before* tokenizing, so contractions lose
    # their apostrophe (e.g. "don't" -> "dont") ahead of the stop-word lookup.
    text = _PUNCTUATION_RE.sub('', text)

    tokens = word_tokenize(text)

    stop_words, stemmer = _preprocess_resources()
    return ' '.join(
        stemmer.stem(word) for word in tokens if word.lower() not in stop_words
    )

# Run every review through the preprocessing routine
preprocessed_documents = list(map(preprocess_text, documents))

# Hold out 20% of the reviews for evaluation; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    preprocessed_documents,
    labels,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: the vectorizer is fitted on the training split only,
# then reused unchanged to transform the test split
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Fit a logistic regression classifier on the training features
clf = LogisticRegression().fit(X_train_tfidf, y_train)

# Predict the held-out reviews and report the fraction classified correctly
y_pred = clf.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.755
