Import libraries and load the training data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Show every column on one line when a DataFrame is printed
pd.set_option('display.width', None)  # None lets pandas auto-detect the display width
pd.set_option('display.max_columns', None)  # never elide columns

# Records are parsed as four ":::"-separated fields: ID, TITLE, GENRE, DESCRIPTION
train_data = []

with open("train_data.txt", "r", encoding="utf-8") as file:
    for line in file:
        # maxsplit=3 keeps any further ":::" inside the description field
        parts = line.strip().split(":::", 3)
        if len(parts) != 4:
            # Blank or malformed line: skip it instead of failing on tuple unpacking
            continue
        train_data.append(tuple(parts))

# Create DataFrame with column names
train_df = pd.DataFrame(train_data, columns=["ID", "TITLE", "GENRE", "DESCRIPTION"])

# Show the first few rows
print(train_df.head(10))

# Encode the genre strings as integer class ids; label_encoder is kept so that
# predictions can be mapped back to genre names later
label_encoder = LabelEncoder()
train_df['GENRE_ENCODED'] = label_encoder.fit_transform(train_df['GENRE'])

# Preview only: confirm the new column without printing the whole frame
print(train_df.head())


    ID                               TITLE          GENRE  \
0   1        Oscar et la dame rose (2009)          drama    
1   2                        Cupid (1997)       thriller    
2   3    Young, Wild and Wonderful (1980)          adult    
3   4               The Secret Sin (1915)          drama    
4   5              The Unrecovered (2007)          drama    
5   6              Quality Control (2011)    documentary    
6   7                  "Pink Slip" (2009)         comedy    
7   8                One Step Away (1985)          crime    
8   9            "Desperate Hours" (2016)     reality-tv    
9  10                    Spirits (2014/I)         horror    

                                         DESCRIPTION  
0   Listening in to a conversation between his do...  
1   A brother and sister with a past incestuous r...  
2   As the bus empties the students for their fie...  
3   To help their unemployed father make ends mee...  
4   The film's title refers not only to the un-re... 

In [None]:
# Column dtypes and non-null counts of the training frame
# (the notebook never defines `df`; the training data lives in `train_df`)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54214 entries, 0 to 54213
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           54214 non-null  object
 1   TITLE        54214 non-null  object
 2   GENRE        54214 non-null  object
 3   DESCRIPTION  54214 non-null  object
dtypes: object(4)
memory usage: 1.7+ MB


In [None]:
# Missing values per column of the training frame
# (the notebook never defines `df`; the training data lives in `train_df`)
train_df.isnull().sum()

Unnamed: 0,0
ID,0
TITLE,0
GENRE,0
DESCRIPTION,0


In [None]:
# Summary (count / unique / top / freq) of the text columns of the training frame.
# include="object" is required: once the numeric GENRE_ENCODED column exists, a bare
# describe() would summarise only that numeric column.
train_df.describe(include="object")

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
count,54214,54214,54214,54214
unique,54214,54214,27,54086
top,54214,Nature's Fury: Storm of the Century (2006),drama,Grammy - music award of the American academy ...
freq,1,1,13613,12


In [None]:

pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)

# Records are parsed as three ":::"-separated fields: ID, TITLE, DESCRIPTION (no genre)
test_data = []

# Open and read the file
with open("test_data.txt", "r", encoding="utf-8") as file:
    for line in file:
        # maxsplit=2 yields at most 3 fields, so a ":::" that appears inside the
        # description stays in the description instead of producing a 4th field
        # that would get the whole record discarded
        parts = line.strip().split(":::", 2)
        if len(parts) == 3:  # skip blank or malformed lines
            test_data.append(tuple(parts))

# Create DataFrame with column names
test_df = pd.DataFrame(test_data, columns=["ID", "TITLE", "DESCRIPTION"])

# Show the first few rows
print(test_df.head())



   ID                          TITLE  \
0  1           Edgar's Lunch (1998)    
1  2       La guerra de papá (1977)    
2  3    Off the Beaten Track (2010)    
3  4         Meu Amigo Hindu (2015)    
4  5              Er nu zhai (1955)    

                                         DESCRIPTION  
0   L.R. Brane loves his life - his car, his apar...  
1   Spain, March 1964: Quico is a very naughty ch...  
2   One year in the life of Albin and his family ...  
3   His father has died, he hasn't spoken with hi...  
4   Before he was known internationally as a mart...  


In [None]:
#  TF-IDF Vectorization
# Inputs: plot descriptions from both frames; target: the encoded genre ids
train_text = train_df['DESCRIPTION']
test_text = test_df['DESCRIPTION']
y_train = train_df['GENRE_ENCODED']

# Cap the vocabulary at 5000 terms and drop English stop words
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train = vectorizer.fit_transform(train_text)  # fit on the training descriptions only
X_test = vectorizer.transform(test_text)  # transform with the already-fitted vectorizer

In [None]:
#  Train the SVM Model
# Fixed seed: scikit-learn documents that LinearSVC's underlying solver uses a random
# number generator while fitting, so unseeded runs can give slightly different models
svm = LinearSVC(random_state=42)
svm.fit(X_train, y_train)

#  Predict Genres
# Predictions are integer class ids; inverse_transform maps them back to the
# genre strings the label encoder was fitted on
predicted_labels = svm.predict(X_test)
predicted_genres = label_encoder.inverse_transform(predicted_labels)
test_df['PREDICTED_GENRE'] = predicted_genres


In [None]:
#  View Sample Predictions
print("\n🎬 Sample Predictions:\n")
# head(5) yields at most five rows, so a test frame with fewer rows does not
# raise IndexError the way positional iloc[i] over range(5) would
for _, row in test_df.head(5).iterrows():
    print(f"Title: {row['TITLE']}")
    print(f"Plot: {row['DESCRIPTION'][:100]}...")  # first 100 characters of the plot
    print(f"Predicted Genre: {row['PREDICTED_GENRE']}")
    print("-" * 50)



🎬 Sample Predictions:

Title:  Edgar's Lunch (1998) 
Plot:  L.R. Brane loves his life - his car, his apartment, his job, but especially his girlfriend, Vespa. ...
Predicted Genre:  short 
--------------------------------------------------
Title:  La guerra de papá (1977) 
Plot:  Spain, March 1964: Quico is a very naughty child of three belonging to a wealthy middle-class famil...
Predicted Genre:  drama 
--------------------------------------------------
Title:  Off the Beaten Track (2010) 
Plot:  One year in the life of Albin and his family of shepherds in the North of Transylvania. In direct c...
Predicted Genre:  documentary 
--------------------------------------------------
Title:  Meu Amigo Hindu (2015) 
Plot:  His father has died, he hasn't spoken with his brother for about 10 years and has a serious cancer....
Predicted Genre:  drama 
--------------------------------------------------
Title:  Er nu zhai (1955) 
Plot:  Before he was known internationally as a martial arts super