In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import AutoModelForCausalLM, AutoTokenizer,  GenerationConfig
import torch
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report,r2_score



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Colab-only setup, left disabled: uncomment these two lines to mount Google
# Drive when the notebook is run on Google Colab.
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# Location of the IMDB top-1000 CSV; `data` is the working frame for every
# later cell.
path = "/kaggle/input/dataset/imdb_top_1000.csv"
data = pd.read_csv(filepath_or_buffer=path)

In [4]:
# Preview the first four rows to check the columns that were loaded.
data.head(n=4)

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000


In [5]:
# Keep only rows in which every column has a value.
data = data.dropna(axis=0, how='any')

In [6]:
# Parse the release year as a number (unparseable entries become NaN), drop
# the rows that could not be parsed, then store the year as an integer.
data = (
    data
    .assign(Released_Year=pd.to_numeric(data['Released_Year'], errors='coerce'))
    .dropna(subset=['Released_Year'])
    .astype({'Released_Year': int})
)

# Show the raw certificate labels that the next cell normalises.
data['Certificate'].unique()

array(['A', 'UA', 'U', 'R', 'G', 'PG-13', 'PG', 'Passed', 'Approved',
       'TV-PG', 'U/A', 'GP'], dtype=object)

In [7]:
# Collapse the raw certificate labels onto four target classes
# (G / PG / PG-13 / R). Every target class also maps to itself, so re-running
# this cell on already-normalised data leaves the column unchanged.
certificate_mapping = {
    'A': 'R',
    'UA': 'PG-13',
    'U/A': 'PG-13',
    'U': 'G',
    'R': 'R',
    'G': 'G',
    'PG-13': 'PG-13',
    'PG': 'PG',
    'Passed': 'PG',
    'Approved': 'PG',
    'TV-PG': 'PG',
    'GP': 'PG'
}

mapped_certificates = data['Certificate'].map(certificate_mapping)

# `Series.map` turns any label missing from the dict into NaN without
# complaint; fail loudly instead so an unexpected certificate cannot leak NaN
# labels into the classifier further down.
unmapped_certificates = sorted(
    set(data.loc[mapped_certificates.isna(), 'Certificate'].dropna())
)
if unmapped_certificates:
    raise ValueError(
        f"Certificates missing from certificate_mapping: {unmapped_certificates}"
    )

data['Certificate'] = mapped_certificates
data['Certificate'].unique()

array(['R', 'PG-13', 'G', 'PG'], dtype=object)

In [8]:
# Hugging Face Hub checkpoint id; passed to both the tokenizer and the model
# `from_pretrained` calls below.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

In [9]:
import os

from huggingface_hub import login

# Never hardcode an access token in a notebook: anything committed or shared
# with the file is exposed. The token that used to be written here must be
# treated as leaked and revoked in the Hugging Face account settings.
#
# Read the token from the HF_TOKEN environment variable (e.g. populated from a
# Kaggle/Colab secret). When the variable is unset, `token=None` makes
# `login` fall back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [10]:
# Load the causal LM with `output_hidden_states=True` so each forward pass
# returns the per-layer activations, then load its matching tokenizer.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    output_hidden_states=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

Downloading shards: 100%|██████████| 4/4 [01:15<00:00, 18.83s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.11it/s]


In [11]:
# Containers for the two prompt sets (release year / certificate), one prompt
# per movie.
prompts_year, prompts_certificate = [], []

In [12]:
# Build one question per movie for each probing task. The lists are rebuilt
# from scratch here (rather than appended to), so re-running this cell cannot
# duplicate prompts and misalign them with the targets.
# NOTE(review): the prompt text is kept exactly as it was (including the
# "realese" typo and the spacing around the quotes) because the recorded
# results were produced with it; fixing the wording requires regenerating
# every embedding.
movie_titles = data['Series_Title']
prompts_year = [
    f"What is the realese year of the movie'{title}'?"
    for title in movie_titles
]
prompts_certificate = [
    f"What is the certificate rating  of the movie'{title}'?"
    for title in movie_titles
]


In [13]:
# For every release-year prompt, keep the hidden state of the last prompt
# token at three depths: the final layer, the middle layer and index 0.
embeddings_r_final = []
embeddings_r_first = []
embeddings_r_mid = []

# Defined before the loop so the summary print below cannot raise a NameError
# when `prompts_year` is empty.
length1 = None

for prompt in prompts_year:
    inputs = tokenizer(prompt, return_tensors='pt')

    # Inference only: no autograd bookkeeping, and no KV cache — a single
    # forward pass never reuses it, and building it is what triggers the
    # `past_key_values` deprecation warning.
    with torch.no_grad():
        outputs = model(**inputs, use_cache=False)

    # Tuple of per-layer activations returned because the model was loaded
    # with output_hidden_states=True.
    hidden_states = outputs.hidden_states
    length1 = len(hidden_states) // 2

    # `[:, -1, :]` selects the last token position of the prompt.
    embeddings_r_final.append(hidden_states[-1][:, -1, :])
    # NOTE(review): per the Transformers docs, index 0 is the embedding-layer
    # output, so this vector presumably depends only on the final prompt
    # token, which is the same punctuation for most prompts. That would
    # explain a near-zero probe score for this layer — confirm before
    # interpreting it as a "first layer" result.
    embeddings_r_first.append(hidden_states[0][:, -1, :])
    embeddings_r_mid.append(hidden_states[length1][:, -1, :])

# Index of the layer used as the "middle" representation.
print(length1)



We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


16


In [14]:
def convert(embeddings):
    """Stack per-prompt embeddings into a single 2-D NumPy array.

    Parameters
    ----------
    embeddings : iterable
        Torch tensors and/or NumPy arrays, one entry per prompt. An array
        previously returned by this function is also accepted, which keeps
        the calling cell safe to re-run.

    Returns
    -------
    numpy.ndarray
        The entries stacked row-wise with ``np.vstack``.

    Raises
    ------
    ValueError
        Propagated from ``np.vstack`` when ``embeddings`` is empty.
    """
    rows = []
    for e in embeddings:
        if isinstance(e, torch.Tensor):
            # detach + cpu make the conversion valid for tensors that track
            # gradients or live on an accelerator; plain `.numpy()` raises
            # for both.
            e = e.detach().cpu().numpy()
        rows.append(np.asarray(e))
    return np.vstack(rows)

In [15]:
# For every certificate prompt, keep the hidden state of the last prompt
# token at three depths: the final layer, the middle layer and index 0.
embeddings_c_final = []
embeddings_c_first = []
embeddings_c_mid = []

for prompt in prompts_certificate:
    inputs = tokenizer(prompt, return_tensors='pt')

    # Inference only: no autograd bookkeeping, and no KV cache — a single
    # forward pass never reuses it, and building it is what triggers the
    # `past_key_values` deprecation warning.
    with torch.no_grad():
        outputs = model(**inputs, use_cache=False)

    # Tuple of per-layer activations returned because the model was loaded
    # with output_hidden_states=True.
    hidden_states = outputs.hidden_states
    length1 = len(hidden_states) // 2

    # `[:, -1, :]` selects the last token position of the prompt.
    embeddings_c_final.append(hidden_states[-1][:, -1, :])
    # NOTE(review): per the Transformers docs, index 0 is the embedding-layer
    # output, so this vector presumably depends only on the final prompt
    # token, which is the same punctuation for most prompts. That would make
    # this probe little better than predicting the majority class — confirm
    # before interpreting it as a "first layer" result.
    embeddings_c_first.append(hidden_states[0][:, -1, :])
    embeddings_c_mid.append(hidden_states[length1][:, -1, :])


In [16]:
# Turn each list of per-prompt tensors into one stacked NumPy array:
# certificate prompts first, then release-year prompts.
embeddings_c_final, embeddings_c_first, embeddings_c_mid = (
    convert(embeddings_c_final),
    convert(embeddings_c_first),
    convert(embeddings_c_mid),
)
embeddings_r_final, embeddings_r_first, embeddings_r_mid = (
    convert(embeddings_r_final),
    convert(embeddings_r_first),
    convert(embeddings_r_mid),
)

In [17]:
# Probe targets as plain Python lists, in the same row order as the prompts.
years = data['Released_Year'].tolist()

certificate = data['Certificate'].tolist()


In [18]:
def Regression(Embeddings, Target, test_size=0.2, random_state=42):
    """Fit a linear-regression probe on the embeddings and report its R².

    The data is split into train and test parts, an ordinary least-squares
    model is fitted on the train part, and the R² score on the held-out part
    is printed and returned.

    Parameters
    ----------
    Embeddings : array-like
        Feature matrix, one row per sample.
    Target : array-like
        Numeric target, one value per row of ``Embeddings``.
    test_size : float, default 0.2
        Fraction of the samples held out for evaluation.
    random_state : int, default 42
        Seed for the train/test split, so runs are reproducible.

    Returns
    -------
    float
        R² score on the held-out split (also printed). Earlier versions
        returned nothing, so existing callers that ignore the result are
        unaffected.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        Embeddings, Target, test_size=test_size, random_state=random_state
    )

    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)

    regression_r2 = r2_score(y_test, y_pred)
    print("Regression R² Score:", regression_r2)
    return regression_r2


In [19]:
def Classifier(Embeddings, Target, test_size=0.2, random_state=42, max_iter=1000):
    """Fit a logistic-regression probe on the embeddings and report accuracy.

    The data is split into train and test parts, a logistic-regression
    classifier is fitted on the train part, and its accuracy on the held-out
    part is printed and returned.

    Parameters
    ----------
    Embeddings : array-like
        Feature matrix, one row per sample.
    Target : array-like
        Class label for each row of ``Embeddings``.
    test_size : float, default 0.2
        Fraction of the samples held out for evaluation.
    random_state : int, default 42
        Seed for the train/test split, so runs are reproducible.
    max_iter : int, default 1000
        Iteration budget passed to ``LogisticRegression``.

    Returns
    -------
    float
        Accuracy on the held-out split (also printed). Earlier versions
        returned nothing, so existing callers that ignore the result are
        unaffected.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        Embeddings, Target, test_size=test_size, random_state=random_state
    )

    # `multi_class='multinomial'` is deprecated in recent scikit-learn and is
    # already what the default solver uses for more than two classes, so it
    # is omitted to avoid the FutureWarning.
    classifier = LogisticRegression(max_iter=max_iter)
    classifier.fit(X_train, y_train)

    # `score` predicts on X_test itself, so no separate predict call is needed.
    accuracy = classifier.score(X_test, y_test)
    print(f"Accuracy:{accuracy}")
    return accuracy


In [20]:
# Release-year probe at each depth, in order: final layer, middle layer,
# index-0 hidden state.
for year_embeddings in (embeddings_r_final, embeddings_r_mid, embeddings_r_first):
    Regression(year_embeddings, years)

Regression R² Score: 0.7555598020553589
Regression R² Score: 0.8221951723098755
Regression R² Score: -0.05106854438781738


In [21]:
# Certificate probe at each depth, in order: final layer, middle layer,
# index-0 hidden state.
for certificate_embeddings in (embeddings_c_final, embeddings_c_mid, embeddings_c_first):
    Classifier(certificate_embeddings, certificate)



Accuracy:0.6923076923076923




Accuracy:0.6993006993006993




Accuracy:0.4405594405594406
