In [33]:
import requests
import pandas as pd

In [34]:
def movies_url(no):
    return f"https://api.themoviedb.org/3/movie/top_rated?api_key=8265bd1679663a7ea12ac168da84d2e8&language=en-US&page={no}"

In [35]:
def get_genres(ids):
    genres = [{"id":28,"name":"Action"},{"id":12,"name":"Adventure"},{"id":16,"name":"Animation"},{"id":35,"name":"Comedy"},{"id":80,"name":"Crime"},
          {"id":99,"name":"Documentary"},{"id":18,"name":"Drama"},{"id":10751,"name":"Family"},{"id":14,"name":"Fantasy"},{"id":36,"name":"History"},
          {"id":27,"name":"Horror"},{"id":10402,"name":"Music"},{"id":9648,"name":"Mystery"},{"id":10749,"name":"Romance"},{"id":878,"name":"Science Fiction"},
          {"id":10770,"name":"TV Movie"},{"id":53,"name":"Thriller"},{"id":10752,"name":"War"},{"id":37,"name":"Western"}]
    gen = ""
    for i in genres:
        if i['id'] in ids:
            gen += f"{i['name']}, "
    return gen.strip()[:-1]

In [36]:
df = pd.DataFrame(columns=['name','desc','genre'])

In [37]:
for i in range(1, 451): #451
    headers = {"accept": "application/json"}
    response = requests.get(movies_url(i), headers=headers)
    res = response.json()
    for movies in res['results']:
        df.loc[len(df.index)] = [movies['original_title'], movies['overview'], get_genres(movies['genre_ids'])]

In [47]:
df['desc']

0       Spanning  years 1945  1955  chronicle   fictio...
1       Framed   1940s   double murder   wife   lover ...
2       In  continuing saga   Corleone crime family  y...
3       The true story   businessman Oskar Schindler s...
4       Raj   rich carefree happygolucky second genera...
                              ...                        
8995    Luke Skywalker  Han Solo battle evil Imperial ...
8996    The filmmaking team behind  hits Scary Movie D...
8997    Edward Carnby   private investigator specializ...
8998    In  year 3000 man   match   Psychlos  greedy m...
8999    Set   island   coast  techno rave party attrac...
Name: desc, Length: 9000, dtype: object

### html tags

In [1]:
import re
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

In [None]:
df['desc'] = df['desc'].apply(remove_html_tags)

### Remove url

In [2]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

In [None]:
df['desc'] = df['desc'].apply(remove_url)

### Remove punc

In [40]:
import string,time
exclude = string.punctuation
def remove_punc(text):
    return text.translate(str.maketrans('', '', exclude))

In [41]:
df['desc'] = df['desc'].apply(remove_punc)

### Chat conversion

In [5]:
chat_words = []
def chat_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [None]:
df['desc'] = df['desc'].apply(chat_conversion)

### Correct text

In [43]:
from textblob import TextBlob

def correct_text(text):
    textBlb = TextBlob(text)
    return textBlb.correct().string

In [None]:
df['desc'] = df['desc'].apply(correct_text)

### Remove stopwords

In [45]:
from nltk.corpus import stopwords
def remove_stopwords(text):
    new_text = []

    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [46]:
df['desc'] = df['desc'].apply(remove_stopwords)

### Remove emoji

In [10]:
import re
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [None]:
df['desc'] = df['desc'].apply(remove_emoji)

### Tokenization

In [11]:
from nltk.tokenize import word_tokenize, sent_tokenize

##### Word tokenization

In [None]:
df['desc'] = df['desc'].apply(word_tokenize)

##### Sentence tokenization

In [None]:
df['desc'] = df['desc'].apply(sent_tokenize)

### Stemming words

In [12]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [None]:
df['desc'] = df['desc'].apply(stem_words)

### Lemmatizer word

In [48]:
import nltk
# nltk.download('all')
from nltk.stem import WordNetLemmatizer

def lemmatizer_word(text):
    # punctuations = "?:!.,;"
    # sentence_words = nltk.word_tokenize(text)
    # print(sentence_words)
    # for word in sentence_words:
    #     if word in punctuations:
    #         sentence_words.remove(word)
    wordnet_lemmatizer = WordNetLemmatizer()
    return " ".join([wordnet_lemmatizer.lemmatize(word,pos='v') for word in text.split()])

In [49]:
df['desc'] = df['desc'].apply(lemmatizer_word)

In [50]:
df.to_csv('clean_movies_text_dataset.csv', header=False, index=False)