### Import necessary libraries

In [None]:
import nltk
# Fetch the NLTK resources used by the preprocessing step below.
# 'punkt_tab' is the tokenizer data that word_tokenize loads in NLTK >= 3.9;
# 'punkt' is still downloaded so older NLTK versions keep working.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import pandas as pd
import joblib
from tqdm import tqdm
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

### Load the dataset

In [None]:
# Use forward slashes: they work on every OS (Windows included) and avoid the
# invalid "\d" / "\I" escape sequences that a backslash-separated, non-raw
# string literal would contain.
df = pd.read_csv('../data/IMDB Dataset.csv')

### Preprocess the loaded data

In [None]:
# Filled on first use so the NLTK resources are built once instead of on every call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the English stop-word set and a shared lemmatizer, built on first use."""
    if not _PREPROCESS_RESOURCES:
        # A frozenset gives constant-time membership tests; calling
        # stopwords.words('english') for every token and scanning the
        # resulting list is needlessly slow on a large dataset.
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Clean a piece of text and return it as a space-separated token string.

    Steps, in order: lowercase the text, delete every character that is not
    an ASCII letter or whitespace, tokenize with NLTK's ``word_tokenize``,
    drop English stop words, lemmatize each remaining token with
    ``WordNetLemmatizer``, and join the tokens with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned text as a single string.
    """
    stop_words, lemmatizer = _get_preprocess_resources()

    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stop words, then lemmatize what is left
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Join tokens back into text
    return ' '.join(tokens)

In [None]:
# Register tqdm's `progress_apply` on pandas objects. Without this call a
# Series has no such method and the next line raises AttributeError.
tqdm.pandas()
# Clean every review in place, showing a progress bar while it runs.
df['review'] = df['review'].progress_apply(preprocess_text)

### Save the preprocessed data

In [None]:
# Persist the cleaned reviews so preprocessing does not have to be repeated.
# The file name is kept exactly as is, since other code may read it by this name.
# NOTE: with pandas' defaults the DataFrame index is written as the first column.
preprocessed_csv_path = "../data/imdb_prerocessed_dataset.csv"
df.to_csv(preprocessed_csv_path)

### Split dataset into training and testing sets

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
features = df['review']
labels = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

### Define pipeline

In [None]:
# Two-stage model: TF-IDF features feeding a linear support-vector classifier.
tfidf_step = ('tfidf', TfidfVectorizer())
classifier_step = ('clf', LinearSVC())
pipeline = Pipeline(steps=[tfidf_step, classifier_step])

### Train the model

In [None]:
# Fit the vectorizer and the classifier on the training split only.
pipeline.fit(X=X_train, y=y_train)

### Evaluate the model

In [None]:
# Score the fitted pipeline on the held-out reviews.
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

### Save the model

In [None]:
from pathlib import Path

model_path = '../models/sentiment_classifier.pkl'
# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing the fitted pipeline.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(pipeline, model_path)