In [1]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [2]:
ratings_single_account = pd.read_csv('data/ratings_single_account.csv')
google_trends_fake_rating_months = pd.read_csv('data/google_trends_fake_rating.csv')
all_ratings = pd.read_csv('data/all_ratings.csv')

In [3]:
ratings_single_account.head()

Unnamed: 0,rating_id,userId,movieId,rating,rating_date
0,19834829,202382,1,5.0,1997-05-20 06:50:22
1,25102792,256349,1,5.0,2017-08-24 03:49:46
2,15849334,161826,1,3.0,2016-12-17 15:13:43
3,21184239,216433,1,5.0,2015-11-19 04:07:21
4,4464337,45860,1,5.0,2001-11-24 12:06:59


In [4]:
len(ratings_single_account)

5620

In [5]:
google_trends_fake_rating_months.head()

Unnamed: 0,Month,fake_rating
0,2004-01,0
1,2004-02,0
2,2004-03,0
3,2004-04,0
4,2004-05,0


In [6]:
len(google_trends_fake_rating_months)

229

In [7]:
def calcluate_rating_amount_grouped_by_year(ratings_df, row_name, key=None):
    year_ratings_dict = {}
    
    year_column = ratings_df[row_name].astype(str).str[:4].astype(int)
    
    if key is None:
        year_ratings_dict = year_column.value_counts().to_dict()
    else:
        year_ratings_dict = year_column.to_frame().join(ratings_df[key]).groupby(row_name).sum().to_dict()[key]
    
    year_ratings = pd.DataFrame.from_dict(year_ratings_dict, orient='index', columns=['rating_amount'])
    year_ratings.index.name = 'year'
    year_ratings = year_ratings.reset_index()
    
    return year_ratings

In [8]:
df_single_user_ratings_binned_in_years = calcluate_rating_amount_grouped_by_year(ratings_single_account, 'rating_date')

In [9]:
single_user_ratings_binned_in_years_reduced = df_single_user_ratings_binned_in_years.loc[(df_single_user_ratings_binned_in_years['year'] >= 2011) & (df_single_user_ratings_binned_in_years['year'] < 2018)]

In [10]:
single_user_ratings_binned_in_years_reduced['rating_amount'].sum()

2772

In [11]:
single_user_ratings_binned_in_years_reduced['rating_amount'] = (single_user_ratings_binned_in_years_reduced['rating_amount'] / single_user_ratings_binned_in_years_reduced['rating_amount'].sum() * 100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_user_ratings_binned_in_years_reduced['rating_amount'] = (single_user_ratings_binned_in_years_reduced['rating_amount'] / single_user_ratings_binned_in_years_reduced['rating_amount'].sum() * 100)


In [12]:
single_user_ratings_binned_in_years_reduced.head()

Unnamed: 0,year,rating_amount
0,2017,20.851371
2,2016,18.542569
3,2015,18.001443
6,2014,11.002886
7,2011,11.002886


In [13]:
fig = go.Figure(data=[
    go.Bar(name='Ratings from users who rated only one movie', x=single_user_ratings_binned_in_years_reduced['year'], y=single_user_ratings_binned_in_years_reduced['rating_amount'])
    ])

fig.update_xaxes(tickvals=single_user_ratings_binned_in_years_reduced['year'])
    
fig.update_layout(title_text=f'Singel user ratings between 2011 and 2017')
fig.update_layout(barmode='group')  
fig.show()

In [14]:
google_trends_fake_rating_years = calcluate_rating_amount_grouped_by_year(google_trends_fake_rating_months, 'Month', 'fake_rating')
google_trends_fake_rating_years.head()

Unnamed: 0,year,rating_amount
0,2004,143
1,2005,163
2,2006,111
3,2007,137
4,2008,89


In [15]:
google_trends_fake_rating_years_reduced = google_trends_fake_rating_years.loc[(google_trends_fake_rating_years['year'] >= 2011) & (google_trends_fake_rating_years['year'] < 2018)]
google_trends_fake_rating_years_reduced.head(8)

Unnamed: 0,year,rating_amount
7,2011,151
8,2012,141
9,2013,146
10,2014,137
11,2015,141
12,2016,174
13,2017,276


In [16]:
google_trends_fake_rating_years_reduced['rating_amount'] = (google_trends_fake_rating_years_reduced['rating_amount'] / google_trends_fake_rating_years_reduced['rating_amount'].sum() * 100)
google_trends_fake_rating_years_reduced



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,year,rating_amount
7,2011,12.950257
8,2012,12.092624
9,2013,12.521441
10,2014,11.749571
11,2015,12.092624
12,2016,14.922813
13,2017,23.670669


In [17]:
fig = go.Figure(data=go.Scatter(x=google_trends_fake_rating_years_reduced['year'], y=google_trends_fake_rating_years_reduced['rating_amount']))
fig.show()

In [18]:
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=google_trends_fake_rating_years_reduced['year'],
        y=google_trends_fake_rating_years_reduced['rating_amount']
    ))

fig.add_trace(
    go.Bar(
        x=single_user_ratings_binned_in_years_reduced['year'],
        y=single_user_ratings_binned_in_years_reduced['rating_amount']
    ))

In [19]:
years_all_ratings = calcluate_rating_amount_grouped_by_year(all_ratings, 'rating_date')

In [20]:
years_all_ratings.head()

Unnamed: 0,year,rating_amount
0,2016,2077207
1,2000,2034733
2,2017,1974032
3,2015,1908076
4,2005,1849611


In [21]:
years_all_ratings['rating_amount'].sum()

27753444

In [22]:
all_user_ratings_binned_in_years_reduced = years_all_ratings.loc[(years_all_ratings['year'] >= 2010) & (years_all_ratings['year'] < 2018)]

In [23]:
all_user_ratings_binned_in_years_reduced.head(9)

Unnamed: 0,year,rating_amount
0,2016,2077207
2,2017,1974032
3,2015,1908076
15,2010,981847
17,2011,834175
18,2012,793162
20,2013,632798
21,2014,585109


In [24]:
def yearly_growth(df, year_column, number_column):
  # Create a new dataframe to store the results
  growth_df = pd.DataFrame(columns=['year', 'growth'])
  
  # Sort the dataframe by the year column
  df = df.sort_values(by=year_column)
  
  # Iterate over the rows of the dataframe
  for i, row in df.iterrows():
    # Get the year and number for the current row
    year = row[year_column]
    number = row[number_column]
    
    # Get the number for the previous year
    prev_year = df[df[year_column] == year - 1][number_column]
    
    # If there is no number for the previous year, skip this row
    if prev_year.empty:
      continue
    
    # Calculate the growth between the previous year and the current year
    growth = (number - prev_year.iloc[0]) / prev_year.iloc[0]
    
    # Convert the growth to percentage
    growth_percentage = growth * 100
    
    # Add the year and growth to the results dataframe
    growth_df = growth_df.append({'year': year, 'growth': growth_percentage}, ignore_index=True)
  
  growth_df['year'] = growth_df['year'].astype('int64')
  return growth_df

In [25]:
all_user_ratings_growth = yearly_growth(all_user_ratings_binned_in_years_reduced, 'year', 'rating_amount')
all_user_ratings_growth.head(9)


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Unnamed: 0,year,growth
0,2011,-15.040225
1,2012,-4.916594
2,2013,-20.218316
3,2014,-7.536212
4,2015,226.106076
5,2016,8.863955
6,2017,-4.967006


In [26]:
single_user_ratings_binned_in_years_less_reduced = df_single_user_ratings_binned_in_years.loc[(df_single_user_ratings_binned_in_years['year'] >= 2010) & (df_single_user_ratings_binned_in_years['year'] < 2018)]
single_user_ratings_growth = yearly_growth(single_user_ratings_binned_in_years_less_reduced, 'year', 'rating_amount')
single_user_ratings_growth.head(9)


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Unnamed: 0,year,growth
0,2011,9.71223
1,2012,-10.819672
2,2013,9.926471
3,2014,2.006689
4,2015,63.606557
5,2016,3.006012
6,2017,12.451362


In [27]:
growth_user_single_ratings_compared_all_ratings = single_user_ratings_growth.copy()
growth_user_single_ratings_compared_all_ratings['growth'] = growth_user_single_ratings_compared_all_ratings['growth'] - all_user_ratings_growth['growth']
growth_user_single_ratings_compared_all_ratings.head(9)

Unnamed: 0,year,growth
0,2011,24.752455
1,2012,-5.903078
2,2013,30.144787
3,2014,9.542901
4,2015,-162.499519
5,2016,-5.857943
6,2017,17.418368


In [28]:
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=growth_user_single_ratings_compared_all_ratings['year'],
        y=growth_user_single_ratings_compared_all_ratings['growth']
    ))

In [37]:
figure = make_subplots(rows=1, cols=2,
                        subplot_titles=("Comparison of rating amount from users <br> with only one rating to search traffic of the term<br> \"fake rating\" between the years 20011 and 2017",
                                        "Yearly growth of ratings from users with only one rating<br> in relation to the growth of all ratings"), horizontal_spacing=0.2)
colors = px.colors.sequential.Viridis

trace_bar_trends = go.Bar(
        x=single_user_ratings_binned_in_years_reduced['year'],
        y=single_user_ratings_binned_in_years_reduced['rating_amount'],
        marker_color=colors[3], name='Rating amount',
        legendgroup='1',
        legendgrouptitle_text="Comparison chart:"
    )

trace_scatter_google_trends = go.Scatter(
        x=google_trends_fake_rating_years_reduced['year'],
        y=google_trends_fake_rating_years_reduced['rating_amount'],
        line=dict(color=colors[8], width=2), name='Google trends<br>traffic', legendgroup='1'
    )

figure.add_traces([trace_bar_trends, trace_scatter_google_trends], 1, 1)

figure.add_traces(go.Bar(x=growth_user_single_ratings_compared_all_ratings['year'],
                        y=growth_user_single_ratings_compared_all_ratings['growth'],
                        marker_color=colors[6], name='Growth in<br> percentage', legendgroup='2', legendgrouptitle_text="Growth chart:"), 1, 2)

figure.add_hline(y=5, line_dash="dot", row=1, col=1, line=dict(color='black'))
figure.add_hline(y=10, line_dash="dot", row=1, col=1, line=dict(color='black'))
figure.add_hline(y=15, line_dash="dot", row=1, col=1, line=dict(color='black'))
figure.add_hline(y=20, line_dash="dot", row=1, col=1, line=dict(color='black'))

figure.add_hline(y=-150, line_dash="dot", row=1, col=2, line=dict(color='black', width=1))
figure.add_hline(y=-10, line_dash="dot", row=1, col=2, line=dict(color='black', width=1))
figure.add_hline(y=0, row=1, col=2, line=dict(color='black', width=1))
figure.add_hline(y=10, line_dash="dot", row=1, col=2, line=dict(color='black', width=1))
figure.add_hline(y=20, line_dash="dot", row=1, col=2, line=dict(color='black', width=1))
figure.add_hline(y=30, line_dash="dot", row=1, col=2, line=dict(color='black', width=1))

figure.update_layout(legend=dict(x=0.45, y=1))
figure.update_layout(paper_bgcolor='#FFFFFF')

figure.update_layout(
    legend_tracegroupgap = 20
)

figure.update_layout(plot_bgcolor="#FFFFFF")

#figure.update_layout(yaxis=dict(tickformat=''))
figure.update_yaxes(ticksuffix='%')

figure.update_layout(
    xaxis = dict(
        tickmode = 'linear',
        tick0 = 2011,
        dtick = 1
    ),
    xaxis2 = dict(
        tickmode = 'linear',
        tick0 = 2011,
        dtick = 1
    ),
    yaxis2 = dict(
        tickmode = 'array',
        tickvals = [-150, -10, 0, 10, 20, 30]
),
 height=700)
#figure.layout.annotations[0].update(xanchor='right')
#figure.layout.annotations[0].update(x=0.3)

figure.update_layout(
    yaxis = dict(
        tickmode = 'array',
        tickvals = [5, 10, 15, 20, 25],
        ticktext = [20, 40, 60, 80, 100]
    )
)

figure.show()