In [149]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd

# Load the dataset
data = pd.read_csv('Wiki_Telugu_Movies_1930_1999.csv')

# Display first few rows to understand its structure
print(data.head())
print(data.info())
print(data.columns)

                     Title             Director  \
0  Barrister Parvateesarrn           R. Prakash   
1           Bhoja Kalidasu           H. V. Babu   
2               Bhookailas  Sundar Rao Nadkarni   
3             Bondam Pelli          H. M. Reddy   
4      Chaduvukunna Bharya          H. M. Reddy   

                           Production Music Composer  \
0            Motion Pictures Combines            NaN   
1                          Jaya Films            NaN   
2               Saraswathi Cine Films            NaN   
3  Madras United Artistes Corporation            NaN   
4  Madras United Artistes Corporation            NaN   

                                            Cast       Genre Release Date  \
0                                 Lanka Satyam,       Comedy         1940   
1  Adhanki Srirama Murthy, Parepalli Subba Rao,    Mythology         1940   
2                            M. V. Subbiah Naidu  Devotional         1940   
3                                   L. V. Prasad

In [120]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the dataset
try:
    data = pd.read_csv('Wiki_Telugu_Movies_1930_1999.csv')
except FileNotFoundError:
    print("File 'Wiki_Telugu_Movies_1930_1999.csv' not found. Please check the file path.")
    raise  # Raise error to stop execution if the file is not found

# Check if 'Year' and 'Genre' columns are in the DataFrame
if 'Year' not in data.columns or 'Genre' not in data.columns:
    raise ValueError("The required columns 'Year' and 'Genre' are missing from the dataset.")

# Drop rows with missing values in 'Year' or 'Genre' columns for simplicity
data = data[['Year', 'Genre']].dropna()

# Verify the shape of the data after dropping missing values
print(f"Data after dropping missing values: {data.shape}")

# Selecting features and target based on available columns
X = data[['Year']]  # Using 'Year' as the feature
y = data['Genre']   # Using 'Genre' as the target

# Encode the target variable ('Genre') if it's categorical
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Scale the numerical features (StandardScaler expects 2D input)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Verify the shape of the transformed data
print(f"Shape of X_train: {X_train.shape}, X_val: {X_val.shape}, X_test: {X_test.shape}")
print(data.head())

Data after dropping missing values: (1082, 2)
Shape of X_train: (757, 1), X_val: (162, 1), X_test: (163, 1)
   Year       Genre
0  1940      Comedy
1  1940   Mythology
2  1940  Devotional
3  1940      Comedy
4  1940      Comedy


In [132]:
import torch
from torch.utils.data import Dataset, DataLoader

class MovieDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Create datasets
train_data = MovieDataset(X_train, y_train)
val_data = MovieDataset(X_val, y_val)
test_data = MovieDataset(X_test, y_test)

# DataLoaders
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
val_loader = DataLoader(val_data, batch_size=16)
test_loader = DataLoader(test_data, batch_size=16)

# Example: Iterating through the data loader
for batch_X, batch_y in train_loader:
    print(batch_X.shape, batch_y.shape)

torch.Size([16, 1]) torch.Size([16])
torch.Size([16, 1]) torch.Size([16])
torch.Size([16, 1]) torch.Size([16])
torch.Size([16, 1]) torch.Size([16])
torch.Size([16, 1]) torch.Size([16])
torch.Size([16, 1]) torch.Size([16])
torch.Size([16, 1]) torch.Size([16])
torch.Size([16, 1]) torch.Size([16])
torch.Size([16, 1]) torch.Size([16])
torch.Size([16, 1]) torch.Size([16])
torch.Size([16, 1]) torch.Size([16])
torch.Size([16, 1]) torch.Size([16])
torch.Size([16, 1]) torch.Size([16])
torch.Size([16, 1]) torch.Size([16])
torch.Size([16, 1]) torch.Size([16])
torch.Size([16, 1]) torch.Size([16])
torch.Size([16, 1]) torch.Size([16])
torch.Size([16, 1]) torch.Size([16])
torch.Size([16, 1]) torch.Size([16])
torch.Size([16, 1]) torch.Size([16])
torch.Size([16, 1]) torch.Size([16])
torch.Size([16, 1]) torch.Size([16])
torch.Size([16, 1]) torch.Size([16])
torch.Size([16, 1]) torch.Size([16])
torch.Size([16, 1]) torch.Size([16])
torch.Size([16, 1]) torch.Size([16])
torch.Size([16, 1]) torch.Size([16])
t

In [154]:
# Hyperparameters
input_size = 1  # 'Year' is the only input feature
hidden_size = 64  # Size of the hidden layer
output_size = len(label_encoder.classes_)  # Number of classes in 'Genre'
epochs = 30
learning_rate = 0.001

# Initialize the model, loss function, and optimizer
model = SimpleNN(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()  # CrossEntropyLoss for multi-class classification
optimizer = optim.Adam(model.parameters(), lr=learning_rate)



# Training loop
for epoch in range(epochs):
    model.train()  # Set model to training mode
    train_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()  # Zero gradients before backward pass
        y_pred = model(X_batch)  # Forward pass
        loss = criterion(y_pred, y_batch)  # Compute loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Update model weights
        train_loss += loss.item()  # Accumulate training loss

    # Validation phase
    model.eval()  # Set model to evaluation mode
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():  # No gradient calculation during validation
        for X_batch, y_batch in val_loader:
            y_pred = model(X_batch)  # Forward pass
            loss = criterion(y_pred, y_batch)  # Compute loss
            val_loss += loss.item()  # Accumulate validation loss
            _, predicted = torch.max(y_pred, 1)  # Get predicted class
            total += y_batch.size(0)  # Total number of samples
            correct += (predicted == y_batch).sum().item()  # Correct predictions

    # Calculate average losses and accuracy
    train_loss /= len(train_loader)
    val_loss /= len(val_loader)
    val_accuracy = 100 * correct / total

    # Print the progress
    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%")


Epoch 1/30, Train Loss: 4.4071, Val Loss: 3.9784, Val Accuracy: 24.69%
Epoch 2/30, Train Loss: 3.5857, Val Loss: 3.5389, Val Accuracy: 24.69%
Epoch 3/30, Train Loss: 3.1861, Val Loss: 3.3965, Val Accuracy: 22.84%
Epoch 4/30, Train Loss: 3.0321, Val Loss: 3.3495, Val Accuracy: 26.54%
Epoch 5/30, Train Loss: 2.9466, Val Loss: 3.3395, Val Accuracy: 27.16%
Epoch 6/30, Train Loss: 2.8945, Val Loss: 3.3344, Val Accuracy: 27.78%
Epoch 7/30, Train Loss: 2.8604, Val Loss: 3.3252, Val Accuracy: 27.78%
Epoch 8/30, Train Loss: 2.8262, Val Loss: 3.3340, Val Accuracy: 27.78%
Epoch 9/30, Train Loss: 2.8062, Val Loss: 3.3315, Val Accuracy: 27.78%
Epoch 10/30, Train Loss: 2.7659, Val Loss: 3.3410, Val Accuracy: 27.78%
Epoch 11/30, Train Loss: 2.7937, Val Loss: 3.3434, Val Accuracy: 32.72%
Epoch 12/30, Train Loss: 2.7443, Val Loss: 3.3469, Val Accuracy: 32.10%
Epoch 13/30, Train Loss: 2.7143, Val Loss: 3.3544, Val Accuracy: 32.72%
Epoch 14/30, Train Loss: 2.7187, Val Loss: 3.3594, Val Accuracy: 34.57%
E

In [156]:
# Test phase
model.eval()
test_loss = 0
correct = 0
total = 0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)
        test_loss += loss.item()
        _, predicted = torch.max(y_pred, 1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()

# Calculate test accuracy
test_accuracy = 100 * correct / total
print(f"Test Loss: {test_loss/len(test_loader):.4f}, Test Accuracy: {test_accuracy:.2f}%")


Test Loss: 3.2752, Test Accuracy: 38.65%


In [158]:
# Example of tuning learning rate
for lr in [0.01, 0.001, 0.0001]:
    print(f"\nTraining with learning rate: {lr}")
    
    # Re-initialize the model for each learning rate to avoid carrying over weights from previous training
    model = SimpleNN(input_size, hidden_size, output_size)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    
    # Training loop for this learning rate
    for epoch in range(epochs):
        model.train()  # Set model to training mode
        train_loss = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()  # Zero gradients before backward pass
            y_pred = model(X_batch)  # Forward pass
            loss = criterion(y_pred, y_batch)  # Compute loss
            loss.backward()  # Backpropagation
            optimizer.step()  # Update model weights
            train_loss += loss.item()  # Accumulate training loss

        # Validation phase
        model.eval()  # Set model to evaluation mode
        val_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():  # No gradient calculation during validation
            for X_batch, y_batch in val_loader:
                y_pred = model(X_batch)  # Forward pass
                loss = criterion(y_pred, y_batch)  # Compute loss
                val_loss += loss.item()  # Accumulate validation loss
                _, predicted = torch.max(y_pred, 1)  # Get predicted class
                total += y_batch.size(0)  # Total number of samples
                correct += (predicted == y_batch).sum().item()  # Correct predictions

        # Calculate average losses and accuracy
        train_loss /= len(train_loader)
        val_loss /= len(val_loader)
        val_accuracy = 100 * correct / total

        # Print the progress for this learning rate
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%")



Training with learning rate: 0.01
Epoch 1/30, Train Loss: 3.5727, Val Loss: 3.3193, Val Accuracy: 24.69%
Epoch 2/30, Train Loss: 2.9124, Val Loss: 3.2889, Val Accuracy: 34.57%
Epoch 3/30, Train Loss: 2.8251, Val Loss: 3.4027, Val Accuracy: 34.57%
Epoch 4/30, Train Loss: 2.7849, Val Loss: 3.3953, Val Accuracy: 33.33%
Epoch 5/30, Train Loss: 2.7298, Val Loss: 3.3527, Val Accuracy: 32.72%
Epoch 6/30, Train Loss: 2.6603, Val Loss: 3.3945, Val Accuracy: 32.72%
Epoch 7/30, Train Loss: 2.6534, Val Loss: 3.3710, Val Accuracy: 34.57%
Epoch 8/30, Train Loss: 2.6284, Val Loss: 3.3742, Val Accuracy: 36.42%
Epoch 9/30, Train Loss: 2.5899, Val Loss: 3.4215, Val Accuracy: 35.80%
Epoch 10/30, Train Loss: 2.5485, Val Loss: 3.4336, Val Accuracy: 36.42%
Epoch 11/30, Train Loss: 2.5465, Val Loss: 3.4933, Val Accuracy: 34.57%
Epoch 12/30, Train Loss: 2.5687, Val Loss: 3.4915, Val Accuracy: 34.57%
Epoch 13/30, Train Loss: 2.5488, Val Loss: 3.5030, Val Accuracy: 30.86%
Epoch 14/30, Train Loss: 2.4995, Val L