<a href="https://colab.research.google.com/github/desaivishwas/D590_Project/blob/main/DV_project_back2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
def configure_plotly_browser_state():
    """Emit the RequireJS configuration used to render Plotly figures.

    Displays an HTML snippet in the current cell output that loads
    RequireJS and maps the ``plotly`` module name to the Plotly CDN bundle.
    The notebook calls this at the start of each cell that renders a Plotly
    figure.
    """
    # Import explicitly from the public display API instead of relying on the
    # ``display`` name that the IPython kernel injects into the namespace.
    from IPython.display import HTML, display

    display(HTML(
      '''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-1.5.1.min.js?noext',
            },
          });
        </script>
        '''
    ))

# Indian Cinema through Data

`The goal of this visualization is to analyze the various aspects that contribute to the success of a film. A commercial picture can not only entertain the masses but also make a lot of money for its creators. A good director, excellent actors, the production house, technicians such as editors and cinematographers, and the timing of the movie's release are all key factors in determining whether or not a film will make money. Indian cinema, one of the world’s oldest, is a broad term that refers to a variety of film industries in India, which are mostly split by language and region. The Hindi film industry, popularly known as Bollywood, will be our primary focus. With this project, we hope to visually explore what makes a Bollywood film successful, as well as provide a brief overview of Indian cinema.`



In [None]:
configure_plotly_browser_state()
import pandas as pd
import plotly.express as px
import seaborn  as sns
import matplotlib.pyplot as plt
import wordcloud
import datetime
from IPython.display import HTML
# import plotly.io as pio
# pio.renderers.default='colab'

In [None]:
# Load the Bollywood movie dataset from the Colab working directory and print
# a summary of its columns (dtypes and non-null counts).
movies = pd.read_csv('/content/bollywood_full.csv')
movies.info()

In [None]:
# Preview the first rows of the raw data.
movies.head()

In [None]:
# Discard the columns that none of the later analysis cells read.
movies = movies.drop(
    [
        'poster_path',
        'wiki_link',
        'summary',
        'tagline',
        'title_x',
        'title_y',
        'story',
        'release_date',
    ],
    axis="columns",
)

In [None]:
# Give the remaining columns shorter names for the rest of the notebook.
movies = movies.rename(
    columns=dict(
        original_title="title",
        wins_nominations="awards",
        year_of_release="year",
    )
)

In [None]:
# movies['year'] = movies['year'].replace(r'\r+|\n+|\t+','', regex=True)

In [None]:
# movies['year'].str.count('N').sum()
# Blank out the literal "\N" placeholder as a whole token, then strip any
# remaining stray backslashes. Matching the token (rather than every capital
# "N") keeps titles, names and genres that contain the letter intact.
# NOTE(review): assumes "\N" is the dataset's missing-value marker - confirm.
movies = movies.replace(r'\\N', '', regex=True)
movies = movies.replace(r'\\', '', regex=True)

In [None]:
# Preview the data again after the column and placeholder clean-up.
movies.head()

## Checking for null values in the dataset

In [None]:
# Number of missing (NaN) entries in each column.
movies.isnull().sum()

### Replacing the null values with 0

In [None]:
# Replace every remaining NaN with 0 across all columns.
# NOTE(review): this also writes the integer 0 into text columns such as
# 'genres', which a later cell calls .split() on - confirm no NaN reaches it.
movies = movies.fillna(0)

In [None]:
# Re-check the per-column null counts; every column should now report 0.
movies.isnull().sum()

In [None]:
 counts = movies['year'].value_counts().to_frame("counts")

In [None]:
# Name the column axis "year", then preview the counts in ascending year order.
# NOTE(review): axis="columns" labels the column axis, not the index that
# actually holds the years - confirm this is the intended axis.
counts = counts.rename_axis("year", axis="columns")
counts.sort_index().head()

In [None]:
# Line chart of the number of movies released per year; sort_index() puts the
# years in ascending order along the x-axis.
ax = counts.sort_index().plot(kind='line', figsize = (20, 20), title='# of movies per year from 1950-2019', c='red')
ax.set_xlabel("Years")
ax.set_ylabel("# of movies released")

## WordCloud for movie genres

In [None]:
# Collect every distinct genre label in order of first appearance. Each
# 'genres' cell is a '|'-separated string. Iterating the column values directly
# does not depend on the frame having a default 0..n-1 index, and dict keys
# de-duplicate in O(1) per label while preserving insertion order.
unique_genres = list(dict.fromkeys(
    genre
    for genres in movies['genres']
    for genre in genres.split('|')
))

In [None]:
# Build a copy of the data with one indicator column per genre: 1 when the
# movie's '|'-separated 'genres' string lists that genre, 0 otherwise.
movie_df = movies.copy()
total = len(movies)

# Start every indicator column at zero.
zero_column = [0] * total
for genre_name in unique_genres:
    movie_df[genre_name] = zero_column

# Flip the indicator for each genre a movie lists.
for row in range(total):
    for genre_name in movie_df.at[row, 'genres'].split("|"):
        movie_df.at[row, genre_name] = 1

In [None]:
# Preview two rows to check the new per-genre indicator columns.
movie_df.head(2)

In [None]:
from wordcloud import WordCloud

# Feed the genres as plain space-separated text. Passing str(list) would hand
# the tokenizer the list's brackets, commas and quote characters as well, and
# the quotes can end up attached to the rendered words.
# NOTE: this name shadows the `wordcloud` module imported earlier; the class
# is imported directly above, so the module object is not needed here.
wordcloud = WordCloud(background_color='black', width=600, height=300, max_font_size=150,
                      max_words=200).generate(" ".join(unique_genres))
# Fix the random state so the colours are reproducible between runs.
wordcloud.recolor(random_state=0)
plt.figure(figsize = (14, 10),facecolor='k')
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()

## Conclusion

## Top 10 highest rated movies in the data

In [None]:
# Order the movies from highest to lowest IMDb rating.
ratings_df = movies.sort_values(by= 'imdb_rating', ascending=False)

In [None]:
# Histogram of the IMDb ratings across all movies.
plt.figure(figsize=(22,12))
sns.histplot(data=ratings_df, x='imdb_rating', color='C2')

In [None]:
# ratings_df is sorted by rating (descending), so the first ten rows are the
# ten highest-rated movies; show them as a horizontal bar chart.
top10_df = ratings_df[:10]
top10_df.plot(x="title", y="imdb_rating", kind="barh", figsize=(20, 10), color='C1')

In [None]:
# Work on a copy so the numeric conversion does not modify `movies`.
runtime_df = movies.copy()
# errors='raise' aborts on any value that cannot be parsed as a number.
runtime_df['runtime'] = pd.to_numeric(runtime_df['runtime'], errors='raise')
# runtime_df = movies.sort_values('runtime', ascending=False)
# runtime_df = movies.sort_values('runtime', ascending=False)

In [None]:
# Remove rows that are exact duplicates across every column.
runtime_df = runtime_df.drop_duplicates()

In [None]:
# Preview the de-duplicated frame with the numeric runtime column.
runtime_df.head()

In [None]:
# Histogram of movie runtimes.
plt.figure(figsize=(22,12))
sns.histplot(data=runtime_df, x='runtime', color='C5')

In [None]:
# Set up the Plotly/RequireJS configuration for this cell's output.
configure_plotly_browser_state()
# Rank by runtime before taking ten rows; slicing the unsorted frame would
# just pick the first ten rows rather than the ten longest movies.
top10_runtime = runtime_df.sort_values('runtime', ascending=False).head(10)
fig = px.bar(top10_runtime, y="runtime",x="title",title="Top 10 longest movies in Bollywood",color="title", labels={'title':'Movie Title', 'runtime':'Movie runtime (in Mins)'})
# Embed the figure as raw HTML so it is rendered in the cell output.
HTML(fig.to_html())

In [None]:
# Display the most recently built `fig` (the runtime bar chart).
fig.show()

### Conclusion

## Failed experiment
Write about the IMDb API - budget / box office

## Looking for other sources of data

### About the dataset

## Bollywood Box Office (2017-2020) from Kaggle


#### Notes: bias will be there, less data, does not give a complete picture, improvements, etc.
## Maybe convert INR to dollars for safety

In [None]:
# Load the cleaned box-office dataset from the Colab working directory and
# preview its first three rows.
data = pd.read_csv('/content/bollywood_box_clean.csv')
data.head(3)

In [None]:
import plotly.express as px

# Set up the Plotly/RequireJS configuration for this cell's output.
configure_plotly_browser_state()

# Ten movies with the largest worldwide gross, highest first.
high_earning = data.sort_values('movie_total_worldwide', ascending=False).head(10)
fig = px.bar(
    high_earning,
    x="movie_name",
    y="movie_total_worldwide",
    color="movie_name",
    title="Top 10 Highest Box Office Collections ",
    labels={'movie_name': "movies", "movie_total_worldwide": "Gross Worldwide"},
)
# Embed the figure as raw HTML so it is rendered in the cell output.
HTML(fig.to_html())


In [None]:
# Display the most recently built `fig` (the box-office bar chart).
fig.show()

## genre liked by the audience

In [None]:
# Set up the Plotly/RequireJS configuration for this cell's output.
configure_plotly_browser_state()

# Total worldwide gross per genre, drawn as bubbles sized by that total.
topgenre = data.groupby("movie_genre")["movie_total_worldwide"].sum().reset_index()
fig = px.scatter(
    topgenre,
    x='movie_genre',
    y='movie_total_worldwide',
    size="movie_total_worldwide",
    color="movie_genre",
    title="Which Genre is most liked by Audience",
)
HTML(fig.to_html())

In [None]:
# Display the most recently built `fig` (the genre bubble chart).
fig.show()

In [None]:
# Set up the Plotly/RequireJS configuration for this cell's output.
configure_plotly_browser_state()

# Pie chart with one slice per release year, sized by worldwide gross.
fig = px.pie(
    data,
    names='release_year',
    values='movie_total_worldwide',
    title="In Which Year Most Profit Collected",
)
HTML(fig.to_html())

In [None]:
# Display the most recently built `fig` (the release-year pie chart).
fig.show()

In [None]:
# Set up the Plotly/RequireJS configuration for this cell's output.
configure_plotly_browser_state()

# Sum worldwide gross per director and keep the ten largest totals.
topdir = (
    data.groupby("movie_director")["movie_total_worldwide"]
    .sum()
    .reset_index()
    .sort_values("movie_total_worldwide", ascending=False)
    .head(10)
)
fig = px.bar(
    topdir,
    x="movie_director",
    y="movie_total_worldwide",
    color="movie_director",
    title="Top 10 Directors With Highest Movie Earnings ",
)
HTML(fig.to_html())

In [None]:
# Display the most recently built `fig` (the directors bar chart).
fig.show()

In [None]:
# Set up the Plotly/RequireJS configuration for this cell's output.
configure_plotly_browser_state()

# Bubble chart of runtime against worldwide gross: colour encodes the release
# year and marker size encodes the gross itself.
fig = px.scatter(data, x="runtime", y="movie_total_worldwide", color="release_year",
                 size='movie_total_worldwide')
HTML(fig.to_html())

In [None]:
# Display the most recently built `fig` (the runtime vs. gross scatter).
fig.show()

In [None]:
# Calendar order for the x-axis; the labels must match the values stored in
# the 'release_month' column.
months = "Jan Feb Mar Apr May Jun Jul Aug Sept Oct Nov Dec".split()

sns.set_style("darkgrid")
plt.style.use("fivethirtyeight")
plt.figure(figsize=(16, 8))

# Distribution of 'movie_total' for each release month.
domestic_ax = sns.boxplot(data=data, x="release_month", y="movie_total", order=months)
domestic_ax.set_ylabel("Crores INR")
domestic_ax.set_xlabel("Month")
domestic_ax.set_title("Domestic Gross of Indian Movies by Month")

In [None]:
sns.set_style("darkgrid")
plt.style.use("fivethirtyeight")
plt.figure(figsize=(16, 8))

# Distribution of 'movie_total_worldwide' for each release month, using the
# `months` ordering defined in the previous cell.
worldwide_ax = sns.boxplot(
    data=data,
    x="release_month",
    y="movie_total_worldwide",
    order=months,
)
worldwide_ax.set_ylabel("Crores INR")
worldwide_ax.set_xlabel("Month")
worldwide_ax.set_title("Worldwide Gross of Indian Movies by Month")