## Import the required libraries

In [32]:
# Standard library imports
from re import sub

# Third-party libraries
from contractions import fix
import nltk
from sklearn.model_selection import cross_val_score
from pandas import DataFrame, read_csv, Series
from joblib import dump, load
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
import nlpaug.augmenter.word as naw

# NLTK downloads
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

True

## Define the hyper-parameters

In [33]:
# Regular hyperparameters
lemmatize = False
combine_fields = ['from', 'director', 'title']
stop_words = set(stopwords.words('english'))

# Fine-tuning hyperparameters
threshold = 0
max_features = 10000
ngram_range = (1, 6)

# File paths
train_path = 'train.txt'
test_path = 'test_no_labels.txt'
output_path = 'results.txt'
label_encoder_path = f'models/label_encoder_{int(threshold * 100)}.plk'
vectorizer_path = f'models/vectorizer_{int(threshold * 100)}.plk'
svm_path = f'models/svm/svm_{int(threshold * 100)}.plk'
mnb_path = f'models/mnb/mnb_{int(threshold * 100)}.plk'

## Load data

In [34]:
# Load data
def load_data():
    data = read_csv(train_path, sep='\t', names=['title', 'from', 'genre', 'director', 'plot'])
    return data

In [35]:
data = load_data()
data.head()

Unnamed: 0,title,from,genre,director,plot
0,Ela Cheppanu,Telugu,romance,Ramana,Sekhar (Tarun) is a graduate from IIM and work...
1,A Nightmare on Elm Street,American,horror,Samuel Bayer,Kris Fowles (Katie Cassidy) goes to the Spring...
2,American Gothic,American,horror,John Hough,Cynthia is traumatized by the death of her bab...
3,Gang,Bollywood,crime,Mazhar Khan,"Four friends, Gangu (Jackie Shroff), Abdul (Na..."
4,Intimate Relations,British,drama,Charles Frank,Crisis in a middle-class family when the son f...


### Combine fields

In [36]:
# Combine fields
def combine():
    # Extract relevant columns (plot, combine_fields, genre)
    selected_fields = ['plot'] + combine_fields
    if 'genre' in data.columns:
        selected_fields.append('genre')
    combined_data = data[selected_fields].copy()
    
    # Handle missing values: drop rows with missing 'plot', fill missing combine_fields with ''
    combined_data.dropna(subset=['plot'], inplace=True)  # Ensure 'plot' is not NaN
    for field in combine_fields:
        combined_data[field] = combined_data[field].fillna('')  # Replace NaN in combine_fields with empty strings

    # Combine plot and other specified fields into a single feature
    combined_data['combined_text'] = combined_data['plot']
    for field in combine_fields:
        combined_data['combined_text'] += ' ' + combined_data[field]

    return combined_data

In [37]:
data = combine()

### Data augmentation

In [38]:
# Initialize the SynonymAugmenter
aug = naw.SynonymAug(aug_src='wordnet', aug_max=30)

# Define augmentation settings for each genre
augmentation_config = {
    'sci-fi': 3,    # Augment 3 times
    'animation': 1, # Augment 1 time
    'crime': 1,     # Augment 1 time
}

# Not augmentening the remaining genre
for genre in data['genre'].unique():
    if genre not in augmentation_config:
        augmentation_config[genre] = 0  # Default to augmenting once

# List to hold original and augmented text along with their genres
all_texts = []
all_genres = []

# Loop through each genre and apply the augmentation as needed
for genre, n in augmentation_config.items():
    genre_data = data[data['genre'] == genre].copy()
    
    # Add original texts and genres to the lists
    all_texts.extend(genre_data['combined_text'].tolist())
    all_genres.extend([genre] * len(genre_data))  # Repeat the genre for the number of rows
    
    # Apply augmentation if n > 0
    if n > 0:
        augmented_texts = genre_data['combined_text'].apply(lambda x: aug.augment(x, n=n))
        # Flatten the lists of augmented texts
        augmented_texts = augmented_texts.explode().tolist()
        # Add augmented texts and genres to the lists
        all_texts.extend(augmented_texts)
        all_genres.extend([genre] * len(augmented_texts))  # Repeat the genre for the number of augmented rows

# Create a DataFrame with the text and genre columns
final_df = DataFrame({'text': all_texts, 'genre': all_genres})

## Clean the text

In [39]:
# Clean the text
def clean_text(text):
    # Check if the text is a non-empty string
    if not isinstance(text, str) or not text.strip():
        return ''
    
    # Skip applying contractions if the text is too long or complex
    if len(text) > 500:  # Threshold to skip contraction expansion for long texts
        return text
    
    # Try expanding contractions safely
    try:
        text = fix(text)
    except Exception as e:
        print(f"Error expanding contractions: {e}, for text: {text[:100]}...")  # Only print the first 100 characters
        return text  # Return the original text if expansion fails
    
    # Remove special characters and digits
    text = sub(r'[^a-zA-Z\s]', '', text)
    
    return text

## Function to convert nltk POS tag to wordnet POS tag

In [40]:
# Function to convert nltk POS tag to wordnet POS tag
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ  # Adjective
    elif tag.startswith('V'):
        return wordnet.VERB  # Verb
    elif tag.startswith('N'):
        return wordnet.NOUN  # Noun
    elif tag.startswith('R'):
        return wordnet.ADV  # Adverb
    else:
        return wordnet.NOUN  # Default to noun

## Lemmatize the text

In [41]:
# Function to lemmatize text
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    # Clean the text
    text = clean_text(text)
    # Tokenize the text
    tokens = nltk.word_tokenize(text.lower())
    # Remove non-alphabetic tokens
    tokens = [word for word in tokens if word.isalpha()]
    # Perform POS tagging
    tagged_tokens = nltk.pos_tag(tokens)
    # Lemmatize each token using the POS tag
    lemmatized_tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in tagged_tokens]
    return ' '.join(lemmatized_tokens)

## Preprocess the data

In [42]:
# Preprocess the data without stop words
def preprocess_data():
    # Apply lemmatization if specified
    if lemmatize:
        final_df['text'] = final_df['text'].apply(lemmatize_text)
    else:
        # Clean the text anyway
        final_df['text'] = final_df['text'].apply(clean_text)

    return final_df

In [43]:
data = preprocess_data()
data.head()

Unnamed: 0,text,genre
0,"The protagonist, space archaeologist Simon Wat...",sci-fi
1,A woman (Lanell Cado) steps out of a shower an...,sci-fi
2,The plot involves rogue elements of the commun...,sci-fi
3,"Set in contemporary England, the film follows ...",sci-fi
4,The US military is running a test for a specia...,sci-fi


## Split data into training and test sets

In [44]:
# Split data into training and test sets
def split_data():
    X = data['text']
    y = data['genre']
    # Encode labels
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.1, random_state=42, stratify=y_encoded)
    return X_train, X_test, y_train, y_test, label_encoder

In [45]:
X_train, X_test, y_train, y_test, label_encoder = split_data()

In [46]:
# Save label encoder
dump(label_encoder, label_encoder_path)

['models/label_encoder_0.plk']

## Stopwords filtering

In [47]:
X_train_series = Series(X_train)

# CountVectorizer to create a document-term matrix for stop words
vectorizer = CountVectorizer(vocabulary=stop_words, binary=True)  # Count presence
X = vectorizer.fit_transform(X_train_series)  # Transform the title-plot feature

# Compute document frequencies
doc_freq = X.sum(axis=0) / X.shape[0]  # Fraction of documents (observations) containing each stop word

# Sort stop words by document frequency.
stop_word_df = DataFrame({
    'stop_word': vectorizer.get_feature_names_out(),
    'doc_frequency': doc_freq.A1
}).sort_values(by='doc_frequency', ascending=False)

# Set a threshold for significant stop words
def threshold_stop_word_df():
  return stop_word_df[stop_word_df['doc_frequency'] >= threshold]

stop_words_filtered = threshold_stop_word_df()['stop_word'].tolist()
stop_words_filtered += data['genre'].unique().tolist()

In [48]:
# Feature extraction function
def extract_features():
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        ngram_range=ngram_range,
        sublinear_tf=True,
        norm='l2', stop_words=stop_words_filtered
    )
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    return X_train_vec, X_test_vec, vectorizer


X_train_tfidf, X_test_tfidf, vectorizer = extract_features()



In [49]:
# Save vectorizers
dump(vectorizer, vectorizer_path)

['models/vectorizer_0.plk']

## Train SVM with hyperparameter tuning

In [50]:
#kernels = ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']

In [51]:
def train_svm():
    param_grid = {
        'C': [0.1, 1, 10],
        'kernel': ['linear'],
        'class_weight': ['balanced', None]
    }
    grid_search = GridSearchCV(SVC(), param_grid, cv=5, n_jobs=-1)
    grid_search.fit(X_train_tfidf, y_train)
    best_model = grid_search.best_estimator_
    
    print(f"Best SVM Parameters: {grid_search.best_params_}")
    return best_model

In [52]:
# SVM training 
svm_tfidf = train_svm()

Best SVM Parameters: {'C': 1, 'class_weight': 'balanced', 'kernel': 'linear'}


In [53]:
# Save SVM models
dump(svm_tfidf, svm_path)

['models/svm/svm_0.plk']

## Train Multinomial NB with hyperparameter tuning

In [54]:
# Train Multinomial NB with hyperparameter tuning
def train_MultinomialNB():
    param_grid = {
        'alpha': [0.1, 0.5, 1.0, 1.5, 2.0],  # Smoothing parameter
        'fit_prior': [True, False],           # Whether to learn class prior probabilities
    }
    
    grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5, n_jobs=-1)
    grid_search.fit(X_train_tfidf, y_train)    
    best_model = grid_search.best_estimator_
    _
    print(f"Best Multinomial NB Parameters: {grid_search.best_params_}")
    
    return best_model

In [55]:
# MultinomialNB training 
MultinomialNB_tfidf = train_MultinomialNB()

Best Multinomial NB Parameters: {'alpha': 0.1, 'fit_prior': True}


In [56]:
# Save MultinomialNB models
dump(MultinomialNB_tfidf, mnb_path)

['models/mnb/mnb_0.plk']

## Evaluate the model

In [57]:
# Evaluate the model
def evaluate_model(model):
    predictions = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions, target_names=label_encoder.classes_)
    cm = confusion_matrix(y_test, predictions)
    return accuracy, cm, report

In [58]:
accuracy_svm = evaluate_model(svm_tfidf)[0]
print(f"Accuracy SVM_tfidf: {accuracy_svm * 100:.2f}%")

accuracy_mnb = evaluate_model(MultinomialNB_tfidf)[0]
print(f"Accuracy MultinomialNB_tfidf: {accuracy_mnb * 100:.2f}%")

Accuracy SVM_tfidf: 77.46%
Accuracy MultinomialNB_tfidf: 72.85%


## Load the model and vectorizer

In [59]:
# Load a certain model
def load_model(model_path, vectorizer_path, label_encoder_path):
    model = load(model_path)
    vectorizer = load(vectorizer_path)
    label_encoder = load(label_encoder_path)
    return model, vectorizer, label_encoder

In [60]:
# Determine the best model
if accuracy_svm >= accuracy_mnb:
    best_model_path = svm_path
else:
    best_model_path = mnb_path

# Load the best model
model, vectorizer, label_encoder = load_model(best_model_path, vectorizer_path, label_encoder_path)

## Perform cross-validation on the best model

In [61]:
# Perform cross-validation
cv_scores = cross_val_score(model, X_train_tfidf, y_train, cv=5, scoring='accuracy')
print(f"Cross-validation Accuracy for the best model: {cv_scores.mean() * 100:.2f}% (+/- {cv_scores.std() * 100:.2f}%)")

Cross-validation Accuracy for the best model: 77.76% (+/- 1.12%)


## Predict the genre for new movie plots

In [62]:
# Predict the genre of the test data
def predict_genre():
    data = read_csv(test_path, sep='\t', names=['title', 'from', 'director', 'plot'])
    data = preprocess_data()
    plots = data['combined_text']
    plot_vectors = vectorizer.transform(plots)
    predicted_genres_encoded = model.predict(plot_vectors)
    predicted_genres = label_encoder.inverse_transform(predicted_genres_encoded)

    # Save the results to a file
    data['genre'] = predicted_genres
    data[['title', 'from', 'director', 'plot', 'genre']].to_csv(output_path, sep='\t', index=False)

In [63]:
# predict_genre()