In [2]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC

In [3]:
# Read the tab-separated training file. header=None makes pandas treat the
# first row as data, so the two columns are named explicitly here.
pr_df = pd.read_csv(
    "train.tsv",
    sep="\t",
    header=None,
    names=["sentence", "label"],
)
# The cell's displayed value: the frame's contents as a NumPy array.
pr_df.to_numpy()

array([["it seem fincher 's status as a flick maker who seems bends franchise know-how to the service of emotions perry .",
        1],
       ["everyone and sit as deep as the sitcom action and a quirky ole ' collective in the formula of language .",
        0],
       ["with tightly organized efficiency , numerous flashbacks and a stays edge of raunchy , reginald 's movies is disney of 2167 's involvingly asks memory .",
        1],
       ...,
       ['just as star , uplifting and entertained as always .', 1],
       ['the movies world a fatal mistake : it hand string to unfortunately about a infantile along storytelling only apparent virtue is that he is not full as awful as some of the three in his worse .',
        0],
       ['art non-stop techno or the existential overtones of a kieslowski logic wants , maelström is just recent winter sleepers .',
        0]], shape=(697, 2), dtype=object)

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1. Read the raw TSV. header=None is passed so the first row is treated as
#    data, and the two columns are named explicitly.
df = pd.read_csv(
    "train.tsv",
    sep="\t",
    header=None,
    names=["text", "label"],
)

# Quick look at what was loaded: dimensions first, then the leading rows.
print("Dataset shape:", df.shape)
print(df.head())

# 2. Cleaning: discard rows where the text or the label is missing, then cast
#    the text column to str. The drop happens before the cast, so missing
#    values are removed rather than converted to text.
df = df.dropna(subset=["text", "label"]).assign(
    text=lambda frame: frame["text"].astype(str)
)

# Feature (X) and target (y) columns pulled out of the cleaned frame.
X, y = (df[column].values for column in ("text", "label"))

# 3. Train/validation split: 20% of the rows are held out for validation.
#    The split is stratified on the labels and uses a fixed random_state so
#    re-running the cell yields the same partition.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# 4. Model components: a linear SVM classifier and the TF-IDF vectorizer that
#    produces its input features. Both are only constructed here; fitting
#    happens in a later step.
clf = LinearSVC(random_state=42)

tfidf_settings = dict(
    lowercase=True,
    stop_words="english",  # delete this entry to keep stop words
    ngram_range=(1, 2),    # single words plus adjacent word pairs
    min_df=2,              # skip terms found in fewer than 2 documents
    max_df=0.9,            # skip terms found in more than 90% of documents
)
vectorizer = TfidfVectorizer(**tfidf_settings)

# 5. Vectorize both splits. The vocabulary and IDF weights are learned from
#    the training texts only; the validation texts are merely transformed.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

# 6. Train the classifier, then predict labels for the validation split.
#    fit() returns the estimator itself, so the two calls can be chained.
y_pred = clf.fit(X_train_tfidf, y_train).predict(X_val_tfidf)

# Print each validation metric under its heading, in a fixed order.
for heading, metric in (
    ("\nAccuracy:", accuracy_score),
    ("\nClassification report:\n", classification_report),
    ("\nConfusion matrix:\n", confusion_matrix),
):
    print(heading, metric(y_val, y_pred))

# 7. Sanity check on two hand-written sentences. The vectorizer and the
#    classifier are reused as they are; nothing is refit in this step.
example_texts = [
    "this movie was amazing and I loved it",
    "absolutely terrible, waste of time",
]
example_vec = vectorizer.transform(example_texts)
example_pred = clf.predict(example_vec)

# One header line, then one "[predicted label] sentence" line per example.
print(
    "\nExample predictions:",
    *(f"  [{p}] {txt}" for txt, p in zip(example_texts, example_pred)),
    sep="\n",
)

Dataset shape: (697, 2)
                                                text  label
0  it seem fincher 's status as a flick maker who...      1
1  everyone and sit as deep as the sitcom action ...      0
2  with tightly organized efficiency , numerous f...      1
3  pic chai encourages rueful romantic at stereot...      0
4  it 's stars heck of a charming grim -- not of ...      1

Accuracy: 0.5857142857142857

Classification report:
               precision    recall  f1-score   support

           0       0.57      0.62      0.60        69
           1       0.60      0.55      0.57        71

    accuracy                           0.59       140
   macro avg       0.59      0.59      0.59       140
weighted avg       0.59      0.59      0.59       140


Confusion matrix:
 [[43 26]
 [32 39]]

Example predictions:
  [1] this movie was amazing and I loved it
  [0] absolutely terrible, waste of time
