In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1. Load dataset (no header in file → header=None)
df = pd.read_csv("train.tsv", sep="\t", header=None, names=["text", "label"])
df = df[~((df["text"] == "sentence") & (df["label"] == "label"))]
print("Dataset shape:", df.shape)
print(df.head())

# 2. Basic cleaning (optional but safe)
df = df.dropna(subset=["text", "label"])
df["text"] = df["text"].astype(str)

X = df["text"].values
y = df["label"].values

# 3. Train/validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 4. Build TF-IDF + Linear SVM pipeline
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words="english",   # remove if you don’t want this
    ngram_range=(1, 2),     # unigrams + bigrams
    max_df=0.9,
    min_df=2
)

clf = LinearSVC(random_state=42)

# 5. Fit on training data
X_train_tfidf = vectorizer.fit_transform(X_train)
clf.fit(X_train_tfidf, y_train)

# 6. Evaluate on validation data
X_val_tfidf = vectorizer.transform(X_val)
y_pred = clf.predict(X_val_tfidf)

print("\nAccuracy:", accuracy_score(y_val, y_pred))
print("\nClassification report:\n", classification_report(y_val, y_pred))
print("\nConfusion matrix:\n", confusion_matrix(y_val, y_pred))

# 7. Example: predict on new sentences
example_texts = [
    "this movie was amazing and I loved it",
    "absolutely terrible, waste of time"
]
example_vec = vectorizer.transform(example_texts)
example_pred = clf.predict(example_vec)
print("\nExample predictions:")
for txt, p in zip(example_texts, example_pred):
    print(f"  [{p}] {txt}")

Dataset shape: (697, 2)
                                                text label
0  it confirms fincher 's status as a film maker ...     1
1  hit and miss as far as the comedy goes and a b...     0
2  with tightly organized efficiency , numerous f...     1
3  american chai encourages rueful laughter at st...     0
4  it 's one heck of a character study -- not of ...     1

Accuracy: 0.6928571428571428

Classification report:
               precision    recall  f1-score   support

           0       0.67      0.74      0.70        69
           1       0.72      0.65      0.68        71

    accuracy                           0.69       140
   macro avg       0.69      0.69      0.69       140
weighted avg       0.70      0.69      0.69       140


Confusion matrix:
 [[51 18]
 [25 46]]

Example predictions:
  [1] this movie was amazing and I loved it
  [1] absolutely terrible, waste of time


In [4]:
df.values

array([["it confirms fincher 's status as a film maker who artfully bends technical know-how to the service of psychological insight . ",
        '1'],
       ["hit and miss as far as the comedy goes and a big ole ' miss in the way of story . ",
        '0'],
       ["with tightly organized efficiency , numerous flashbacks and a constant edge of tension , miller 's film is one of 2002 's involvingly adult surprises . ",
        '1'],
       ...,
       ['just as moving , uplifting and funny as ever . ', '1'],
       ['the film makes a fatal mistake : it asks us to care about a young man whose only apparent virtue is that he is not quite as unpleasant as some of the people in his life . ',
        '0'],
       ['without non-stop techno or the existential overtones of a kieslowski morality tale , maelström is just another winter sleepers . ',
        '0']], shape=(697, 2), dtype=object)

In [3]:
)

SyntaxError: unmatched ')' (687023194.py, line 1)