In [None]:
import os

import requests

In [None]:
# Building the TMDB API endpoints
# The API key is taken from the TMDB_API_KEY environment variable so the
# credential does not have to be written into the notebook.
# NOTE(review): the fallback is the key that used to be hard-coded here; it is
# kept only so existing runs keep working. Rotate it and remove the fallback
# once TMDB_API_KEY is configured wherever this notebook runs.
TMDB_API_KEY = os.environ.get('TMDB_API_KEY') or '8265bd1679663a7ea12ac168da84d2e8'
TMDB_BASE_URL = 'https://api.themoviedb.org/3'

# endpoint_movie ends with 'page=' because the page number is appended per request.
endpoint_movie = f'{TMDB_BASE_URL}/movie/top_rated?api_key={TMDB_API_KEY}&language=en-US&page='
endpoint_genre = f'{TMDB_BASE_URL}/genre/movie/list?api_key={TMDB_API_KEY}&language=en-US'

In [None]:
# Download the top-rated listing page by page and keep a few fields per movie.
FIRST_PAGE = 1
LAST_PAGE = 471        # inclusive; same pages as the previous range(1, 472)
REQUEST_TIMEOUT = 10   # seconds; without a timeout requests can wait forever

movies = []
for page in range(FIRST_PAGE, LAST_PAGE + 1):
    try:
        response = requests.get(endpoint_movie + str(page), timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # A network failure or timeout on one page should not abort the whole
        # crawl: report it and carry on, like a non-200 status.
        print(f'Error: page {page}: {exc}')
        continue

    if response.status_code != 200:
        print(f'Error: {response.status_code}')
        continue

    # Parsing the JSON response
    data = response.json()
    for movie in data['results']:
        movies.append({
            'ID': movie['id'],
            'Movie Name': movie['original_title'],
            'Overview': movie['overview'],
            'Genre': movie['genre_ids'],
        })

# Report the size and a small sample instead of printing every record.
print(f'Fetched {len(movies)} movies')
print(movies[:3])

In [None]:
# Fetch the genre lookup list from the genre endpoint.
response = requests.get(endpoint_genre, timeout=10)  # seconds; avoid hanging forever
if response.status_code != 200:
    # Stop here: carrying on would leave `genres` undefined (NameError below)
    # or, worse, silently reuse a value left over from an earlier run.
    raise RuntimeError(f'Error: {response.status_code}')

# Parsing the JSON response
data = response.json()
genres = data['genres']

print(genres)

[{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}, {'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 80, 'name': 'Crime'}, {'id': 99, 'name': 'Documentary'}, {'id': 18, 'name': 'Drama'}, {'id': 10751, 'name': 'Family'}, {'id': 14, 'name': 'Fantasy'}, {'id': 36, 'name': 'History'}, {'id': 27, 'name': 'Horror'}, {'id': 10402, 'name': 'Music'}, {'id': 9648, 'name': 'Mystery'}, {'id': 10749, 'name': 'Romance'}, {'id': 878, 'name': 'Science Fiction'}, {'id': 10770, 'name': 'TV Movie'}, {'id': 53, 'name': 'Thriller'}, {'id': 10752, 'name': 'War'}, {'id': 37, 'name': 'Western'}]


In [None]:
# Step 1: Creating a mapping of genre IDs to genre names (a dictionary)
genre_mapping = {genre['id']: genre['name'] for genre in genres}

# Step 2: Updating the movies list with genre names
# .get(genre_id, genre_id) leaves anything that is not a known id unchanged, so
#   - an id missing from `genres` no longer raises KeyError, and
#   - re-running this cell (when 'Genre' already holds names) is a no-op
#     instead of failing on the first movie.
for movie in movies:
    movie['Genre'] = [genre_mapping.get(genre_id, genre_id) for genre_id in movie['Genre']]

# Print a small sample of the updated movies list rather than every record
print(movies[:3])

In [None]:
# Creating and saving dataframe as csv
import pandas as pd

# Turn the collected movie records into a DataFrame and write it to
# 'movies.csv' without the index column.
df = pd.DataFrame(data=movies)

df.to_csv(path_or_buf='movies.csv', index=False)

In [None]:
# Reload the dataset from the CSV file written earlier
df = pd.read_csv(filepath_or_buffer='movies.csv')

In [None]:
# Preview the first rows of df
df.head()

Unnamed: 0,ID,Movie Name,Overview,Genre
0,278,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"['Drama', 'Crime']"
1,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","['Drama', 'Crime']"
2,240,The Godfather Part II,In the continuing saga of the Corleone crime f...,"['Drama', 'Crime']"
3,424,Schindler's List,The true story of how businessman Oskar Schind...,"['Drama', 'History', 'War']"
4,389,12 Angry Men,The defense and the prosecution have rested an...,['Drama']


In [None]:
# Summarize columns, non-null counts and dtypes of df
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9420 entries, 0 to 9419
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          9420 non-null   int64 
 1   Movie Name  9420 non-null   object
 2   Overview    9419 non-null   object
 3   Genre       9420 non-null   object
dtypes: int64(1), object(3)
memory usage: 294.5+ KB


In [None]:
# Drop every row whose 'Overview' is missing, then re-check the frame summary
df = df.dropna(axis='index', how='any', subset=['Overview'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9419 entries, 0 to 9419
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          9419 non-null   int64 
 1   Movie Name  9419 non-null   object
 2   Overview    9419 non-null   object
 3   Genre       9419 non-null   object
dtypes: int64(1), object(3)
memory usage: 367.9+ KB


In [None]:
# Cast the 'Overview' column to str, then list the dtype of every column
overview_as_text = df['Overview'].astype(str)
df['Overview'] = overview_as_text
df.dtypes

ID             int64
Movie Name    object
Overview      object
Genre         object
dtype: object

In [None]:
# Text Cleaning
import re #Regular expressions library

import string

# Built once instead of on every call: clean_text is applied to each row.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


# Function to clean text by removing URLs, punctuation, and converting to lowercase
def clean_text(text: str) -> str:
    """Return `text` with URLs and punctuation removed, in lowercase.

    Steps, in order:
      1. Delete anything matching ``https?://...`` or ``www....`` up to the
         next whitespace character.
      2. Delete every character listed in ``string.punctuation``. Characters
         are removed, not replaced by a space, so hyphenated words are joined
         ("happy-go-lucky" becomes "happygolucky").
      3. Lowercase the result.

    Whitespace is left as is, so removing a URL can leave a double space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # URLs go first, while their punctuation is still intact for the pattern.
    cleaned_text = _URL_PATTERN.sub('', text)

    # Removing punctuation
    cleaned_text = cleaned_text.translate(_PUNCTUATION_TABLE)

    # Converting cleaned_text to lowercase
    return cleaned_text.lower()

# Run clean_text over every value of the 'Overview' column and keep the
# result in a new 'cleaned_text' column
overview_column = df['Overview']
df['cleaned_text'] = overview_column.apply(clean_text)

# Show the first rows of the raw and cleaned text side by side
cleaning_preview_columns = ['Overview', 'cleaned_text']
df[cleaning_preview_columns].head()

Unnamed: 0,Overview,cleaned_text
0,Imprisoned in the 1940s for the double murder ...,imprisoned in the 1940s for the double murder ...
1,"Spanning the years 1945 to 1955, a chronicle o...",spanning the years 1945 to 1955 a chronicle of...
2,In the continuing saga of the Corleone crime f...,in the continuing saga of the corleone crime f...
3,The true story of how businessman Oskar Schind...,the true story of how businessman oskar schind...
4,The defense and the prosecution have rested an...,the defense and the prosecution have rested an...


In [None]:
# Tokenization
import nltk
from nltk.tokenize import word_tokenize

# Downloading the tokenizer models
# Older NLTK releases load 'punkt'; recent releases look for 'punkt_tab'
# instead and raise LookupError in word_tokenize when it is missing, so
# request both to keep the notebook runnable on a fresh environment.
for tokenizer_resource in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_resource)

# Function to tokenize text
def tokenize_text(text):
    """Return the word tokens that NLTK's word_tokenize produces for `text`."""
    return word_tokenize(text)

# Tokenize each cleaned overview and keep the token lists in a 'tokens' column
cleaned_overviews = df['cleaned_text']
df['tokens'] = cleaned_overviews.apply(tokenize_text)

# Show the first rows of the cleaned text next to its tokens
token_preview_columns = ['cleaned_text', 'tokens']
df[token_preview_columns].head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,cleaned_text,tokens
0,imprisoned in the 1940s for the double murder ...,"[imprisoned, in, the, 1940s, for, the, double,..."
1,spanning the years 1945 to 1955 a chronicle of...,"[spanning, the, years, 1945, to, 1955, a, chro..."
2,in the continuing saga of the corleone crime f...,"[in, the, continuing, saga, of, the, corleone,..."
3,the true story of how businessman oskar schind...,"[the, true, story, of, how, businessman, oskar..."
4,the defense and the prosecution have rested an...,"[the, defense, and, the, prosecution, have, re..."


In [None]:
# Removing Stop words
from nltk.corpus import stopwords

# Fetch NLTK's stop-word corpus
nltk.download('stopwords')

# Gather the English stop words into a set
english_stop_word_list = stopwords.words('english')
stop_words = set(english_stop_word_list)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Function to remove stop words
def remove_stop_words(tokens):
    """Return, in order, the tokens that are not in the `stop_words` set."""
    kept_tokens = []
    for word in tokens:
        if word in stop_words:
            continue
        kept_tokens.append(word)
    return kept_tokens

# Strip stop words from each token list and keep the result in 'filtered_tokens'
token_lists = df['tokens']
df['filtered_tokens'] = token_lists.apply(remove_stop_words)

# Show the first rows before and after stop-word removal
stop_word_preview_columns = ['tokens', 'filtered_tokens']
df[stop_word_preview_columns].head()


Unnamed: 0,tokens,filtered_tokens
0,"[imprisoned, in, the, 1940s, for, the, double,...","[imprisoned, 1940s, double, murder, wife, love..."
1,"[spanning, the, years, 1945, to, 1955, a, chro...","[spanning, years, 1945, 1955, chronicle, ficti..."
2,"[in, the, continuing, saga, of, the, corleone,...","[continuing, saga, corleone, crime, family, yo..."
3,"[the, true, story, of, how, businessman, oskar...","[true, story, businessman, oskar, schindler, s..."
4,"[the, defense, and, the, prosecution, have, re...","[defense, prosecution, rested, jury, filing, j..."


In [None]:
# Stemming
from nltk.stem import PorterStemmer

# Porter stemmer instance used by stem_tokens
stemmer = PorterStemmer()

# Function to stem tokens
def stem_tokens(tokens):
    """Return a list with `stemmer.stem` applied to each token, in order."""
    return list(map(stemmer.stem, tokens))

# Stem each list of filtered tokens and keep the result in 'stemmed_tokens'
tokens_to_stem = df['filtered_tokens']
df['stemmed_tokens'] = tokens_to_stem.apply(stem_tokens)

# Show the first rows of the filtered tokens next to their stems
stemming_preview_columns = ['filtered_tokens', 'stemmed_tokens']
df[stemming_preview_columns].head()

Unnamed: 0,filtered_tokens,stemmed_tokens
0,"[imprisoned, 1940s, double, murder, wife, love...","[imprison, 1940, doubl, murder, wife, lover, u..."
1,"[spanning, years, 1945, 1955, chronicle, ficti...","[span, year, 1945, 1955, chronicl, fiction, it..."
2,"[continuing, saga, corleone, crime, family, yo...","[continu, saga, corleon, crime, famili, young,..."
3,"[true, story, businessman, oskar, schindler, s...","[true, stori, businessman, oskar, schindler, s..."
4,"[defense, prosecution, rested, jury, filing, j...","[defens, prosecut, rest, juri, file, juri, roo..."


In [None]:
# Lemmatization
from nltk.stem import WordNetLemmatizer

# Fetch the WordNet resource
nltk.download('wordnet')

# WordNet lemmatizer instance used by lemmatize_tokens
lemmatizer = WordNetLemmatizer()

# Function to lemmatize tokens
def lemmatize_tokens(tokens):
    """Return a list with `lemmatizer.lemmatize` applied to each token, in order."""
    return list(map(lemmatizer.lemmatize, tokens))

# Lemmatize each list of filtered tokens and keep the result in 'lemmatized_tokens'
tokens_to_lemmatize = df['filtered_tokens']
df['lemmatized_tokens'] = tokens_to_lemmatize.apply(lemmatize_tokens)

# Show the first rows of the filtered tokens next to their lemmas
lemma_preview_columns = ['filtered_tokens', 'lemmatized_tokens']
df[lemma_preview_columns].head()

[nltk_data] Downloading package wordnet to /root/nltk_data...


Unnamed: 0,filtered_tokens,lemmatized_tokens
0,"[imprisoned, 1940s, double, murder, wife, love...","[imprisoned, 1940s, double, murder, wife, love..."
1,"[spanning, years, 1945, 1955, chronicle, ficti...","[spanning, year, 1945, 1955, chronicle, fictio..."
2,"[continuing, saga, corleone, crime, family, yo...","[continuing, saga, corleone, crime, family, yo..."
3,"[true, story, businessman, oskar, schindler, s...","[true, story, businessman, oskar, schindler, s..."
4,"[defense, prosecution, rested, jury, filing, j...","[defense, prosecution, rested, jury, filing, j..."


In [None]:
# Show up to 20 rows of the filtered tokens next to their stemmed form
df[['filtered_tokens', 'stemmed_tokens']].head(20)

Unnamed: 0,filtered_tokens,stemmed_tokens
0,"[imprisoned, 1940s, double, murder, wife, love...","[imprison, 1940, doubl, murder, wife, lover, u..."
1,"[spanning, years, 1945, 1955, chronicle, ficti...","[span, year, 1945, 1955, chronicl, fiction, it..."
2,"[continuing, saga, corleone, crime, family, yo...","[continu, saga, corleon, crime, famili, young,..."
3,"[true, story, businessman, oskar, schindler, s...","[true, stori, businessman, oskar, schindler, s..."
4,"[defense, prosecution, rested, jury, filing, j...","[defens, prosecut, rest, juri, file, juri, roo..."
5,"[raj, rich, carefree, happygolucky, second, ge...","[raj, rich, carefre, happygolucki, second, gen..."
6,"[young, girl, chihiro, becomes, trapped, stran...","[young, girl, chihiro, becom, trap, strang, ne..."
7,"[batman, raises, stakes, war, crime, help, lt,...","[batman, rais, stake, war, crime, help, lt, ji..."
8,"[unemployed, kitaeks, family, takes, peculiar,...","[unemploy, kitaek, famili, take, peculiar, int..."
9,"[supernatural, tale, set, death, row, southern...","[supernatur, tale, set, death, row, southern, ..."


In [None]:
# Sentence Segmentation
from nltk.tokenize import sent_tokenize

# Function to segment text into sentences
def segment_sentences(text):
    """Return the list of sentences NLTK's sent_tokenize finds in `text`."""
    return sent_tokenize(text)

# Apply sentence segmentation to the original overview text.
# clean_text deletes every punctuation character, so 'cleaned_text' has no
# sentence-ending marks left for the tokenizer to split on; segmenting it
# gives one "sentence" per overview. 'Overview' still has its punctuation.
df['sentences'] = df['Overview'].apply(segment_sentences)

# Display the first few rows to check the sentence segmentation results
df[['Overview', 'sentences']].head()

Unnamed: 0,cleaned_text,sentences
0,imprisoned in the 1940s for the double murder ...,[imprisoned in the 1940s for the double murder...
1,spanning the years 1945 to 1955 a chronicle of...,[spanning the years 1945 to 1955 a chronicle o...
2,in the continuing saga of the corleone crime f...,[in the continuing saga of the corleone crime ...
3,the true story of how businessman oskar schind...,[the true story of how businessman oskar schin...
4,the defense and the prosecution have rested an...,[the defense and the prosecution have rested a...
