# IMPORTING LIBRARIES

In [40]:
from requests import get #to send the request to the URL
from bs4 import BeautifulSoup #to get the content in the form of HTML
from time import sleep # to suspend the execution of the thread for a given number of seconds 
from random import randint # to return a random integer
import pandas as pd  #to create dataframe

# SCRAPING 

In [41]:
# Scrape the IMDb sci-fi search results (50 titles per page) into parallel lists.
# Each kept movie appends exactly one entry to every list, so all columns have
# the same length when the DataFrame is built. Missing fields are stored as "".
titles = []
years = []
ratings = []
genres = []
runtimes = []
imdb_ratings = []
metascores = []
votes = []

# `start` is the 1-based index of the first result on the page. `ref_` only
# records which page the navigation came from (adv_nxt / adv_prv).
SEARCH_URL = ("https://www.imdb.com/search/title?genres=sci-fi"
              "&start={start}&explore=title_type,genres&ref_=adv_prv")

# start = 1, 51, 101, 151, 201 -> five result pages
for page in range(1, 250, 50):
    response = get(SEARCH_URL.format(start=page))

    # pause a random number of seconds (between 8 and 15) between requests
    sleep(randint(8, 15))

    # an error page would parse to zero containers and be skipped silently;
    # say so explicitly instead
    if response.status_code != 200:
        print("Skipping start=%d: HTTP %d" % (page, response.status_code))
        continue

    # parse the content of the current response
    page_html = BeautifulSoup(response.text, 'html.parser')
    movie_containers = page_html.find_all('div', class_='lister-item mode-advanced')

    # extract the movies listed on this page
    for container in movie_containers:

        # skip movies without a Metascore
        if container.find('div', class_='ratings-metascore') is None:
            continue

        titles.append(container.h3.a.text)

        # year: strip the parentheses; kept as text
        year_tag = container.h3.find('span', class_='lister-item-year text-muted unbold')
        if year_tag is not None:
            years.append(year_tag.text.replace('(', '').replace(')', ''))
        else:
            years.append("")

        # certificate (e.g. PG-13, R)
        rating_tag = container.p.find('span', class_='certificate')
        if rating_tag is not None:
            ratings.append(rating_tag.text)
        else:
            ratings.append("")

        # genre list, with the embedded newline removed
        genre_tag = container.p.find('span', class_='genre')
        if genre_tag is not None:
            genres.append(genre_tag.text.replace("\n", ""))
        else:
            genres.append("")

        # runtime: drop the " min" suffix and convert to an integer
        runtime_tag = container.p.find('span', class_='runtime')
        if runtime_tag is not None:
            runtimes.append(int(runtime_tag.text.replace(" min", "")))
        else:
            runtimes.append("")

        # IMDb rating: test the tag itself. float(...) never returns None, so
        # the old check could only pass or raise on a missing <strong>.
        imdb_tag = container.strong
        if imdb_tag is not None:
            imdb_ratings.append(float(imdb_tag.text))
        else:
            imdb_ratings.append("")

        # Metascore: int() tolerates the surrounding whitespace in the tag text
        metascore_tag = container.find('span', class_='metascore')
        if metascore_tag is not None:
            metascores.append(int(metascore_tag.text))
        else:
            metascores.append("")

        # vote count is taken from the data-value attribute of the name="nv" span
        votes_tag = container.find('span', attrs={'name': 'nv'})
        if votes_tag is not None and votes_tag.get('data-value') is not None:
            votes.append(int(votes_tag['data-value']))
        else:
            votes.append("")

           

In [42]:
# assemble the scraped lists into one table, one row per movie
movie_columns = {
    'movie': titles,
    'year': years,
    'rating': ratings,
    'genre': genres,
    'runtime_min': runtimes,
    'imdb': imdb_ratings,
    'metascore': metascores,
    'votes': votes,
}
final_df = pd.DataFrame(movie_columns)

In [43]:
# display the scraped movies table (long frames are shown truncated, see the "..." row below)
final_df

Unnamed: 0,movie,year,rating,genre,runtime_min,imdb,metascore,votes
0,Black Panther: Wakanda Forever,2022,PG-13,"Action, Adventure, Drama",161,7.3,67,87694
1,Black Adam,2022,PG-13,"Action, Adventure, Fantasy",125,6.9,41,129512
2,Black Panther,2018,PG-13,"Action, Adventure, Sci-Fi",134,7.3,88,763508
3,Nope,2022,R,"Horror, Mystery, Sci-Fi",130,6.9,77,160456
4,Everything Everywhere All at Once,2022,R,"Action, Adventure, Comedy",139,8.1,81,239261
...,...,...,...,...,...,...,...,...
126,Weird Science,1985,,"Comedy, Romance, Sci-Fi",94,6.6,46,90096
127,Jurassic World,2015,PG-13,"Action, Adventure, Sci-Fi",124,6.9,59,642714
128,Doctor Strange,2016,PG-13,"Action, Adventure, Fantasy",115,7.5,72,739962
129,Alita: Battle Angel,2019,PG-13,"Action, Adventure, Sci-Fi",122,7.3,53,268011


# Scraping Movie Reviews 

In [76]:
# Fetch the user-reviews page for one title (tt5108870) and collect the text of
# every review listed on it.
response = get("https://www.imdb.com/title/tt5108870/reviews?ref_=tt_urv")
# fail loudly on an HTTP error instead of parsing an error page into zero reviews
response.raise_for_status()
page_html_1 = BeautifulSoup(response.text, 'html.parser')
reviews = []
review_containers = page_html_1.find_all('div', class_="lister-item-content")

# The review body div appears with either of these class attributes; try them
# in the same order as before (plain first, then the "clickable" variant).
review_text_classes = ("text show-more__control",
                       "text show-more__control clickable")

for container in review_containers:
    text_div = None
    for css_class in review_text_classes:
        text_div = container.find('div', class_=css_class)
        if text_div is not None:
            break

    # Neither variant present: skip this container. The old fallback raised an
    # uncaught AttributeError here.
    if text_div is None:
        continue

    reviews.append(text_div.text)

In [81]:
# one-column table holding the raw text of each scraped review
review_columns = {"reviews": reviews}
final_df_1 = pd.DataFrame(review_columns)

In [82]:
# display the scraped reviews (one row per review)
final_df_1

Unnamed: 0,reviews
0,Movies like Venom and Morbius need to be Rated...
1,Morbius is one of those rare films that pretty...
2,"""Morbius"" is an Action - Adventure movie in wh..."
3,I have to admit that I had really been looking...
4,"Entertaining and very atmospheric, Morbius man..."
5,Morbius truly has to be one of the films relea...
6,6/10 - I don't know what all the fuss is about...
7,Dangerously ill with a rare blood disorder and...
8,This is an early review. There are no plot det...
9,The pacing is tight and the action was awesome...


# Sentiment Analysis Using Vader

In [83]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [84]:
# download the 'vader_lexicon' NLTK data package (no re-download if already up to date, see output below)
# NOTE(review): presumably the lexicon SentimentIntensityAnalyzer loads — confirm against the NLTK docs
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Farah\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [85]:
# VADER analyzer instance; used below via sent.polarity_scores(text)
sent=SentimentIntensityAnalyzer()

In [86]:
# Run VADER on every review and collect the compound, positive and negative
# scores in three parallel lists (one entry per review, in row order).
score_comp = []
score_pos = []
score_neg = []

# Iterate the 'reviews' column by name. The previous `.iloc[i][0]` used an
# integer key as a position on a label-indexed row, which is deprecated in
# pandas 2.x, and it depended on 'reviews' being the first column.
for review_text in final_df_1['reviews']:
    scores = sent.polarity_scores(review_text)
    score_comp.append(scores['compound'])
    score_pos.append(scores['pos'])
    score_neg.append(scores['neg'])

        

In [87]:
# attach the three VADER score lists to the reviews table as new columns
for column_name, column_values in (
    ('compound_score', score_comp),
    ('positive_score', score_pos),
    ('negative_score', score_neg),
):
    final_df_1[column_name] = column_values

In [88]:
# display the reviews together with their compound / positive / negative scores
final_df_1

Unnamed: 0,reviews,compound_score,positive_score,negative_score
0,Movies like Venom and Morbius need to be Rated...,0.9936,0.207,0.131
1,Morbius is one of those rare films that pretty...,-0.9582,0.133,0.204
2,"""Morbius"" is an Action - Adventure movie in wh...",0.9672,0.182,0.079
3,I have to admit that I had really been looking...,0.9952,0.177,0.058
4,"Entertaining and very atmospheric, Morbius man...",0.9623,0.147,0.101
5,Morbius truly has to be one of the films relea...,-0.296,0.103,0.092
6,6/10 - I don't know what all the fuss is about...,-0.7405,0.0,0.106
7,Dangerously ill with a rare blood disorder and...,0.9896,0.138,0.063
8,This is an early review. There are no plot det...,-0.9807,0.049,0.137
9,The pacing is tight and the action was awesome...,0.9337,0.203,0.089


In [89]:
# average of the VADER compound scores over all scraped reviews
final_df_1['compound_score'].mean()

0.4029879999999999