In [91]:
%matplotlib inline
import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt # graphs and charts

import bq_helper # accessing bigQuery database

import sklearn
from sklearn.model_selection import train_test_split # data splitting
import statsmodels.api as sm
from sklearn import metrics
from sklearn.linear_model import LinearRegression # Linear model

import wordcloud

In [92]:
# Helper bound to the public Stack Overflow dataset; every query below goes through it.
stackoverflow = bq_helper.BigQueryHelper("bigquery-public-data","stackoverflow")

In [93]:
# Show which tables the dataset exposes.
stackoverflow.list_tables()

In [94]:
# Preview a few rows of the questions table used by all later queries.
stackoverflow.head("posts_questions")

In [95]:
# Column schema of the questions table (creation_date, id and tags are used below).
stackoverflow.table_schema("posts_questions")

In [96]:
# Yearly totals over all questions, 2009-2021, one row per year ordered by year.
# NOTE(review): `sum(id)` adds up question ids; it does not count questions.
# `count(*)` is almost certainly what "posts" is meant to be. Any cell that
# divides its own `sum(id)` total by PostsCount.posts relies on this column
# staying a sum of ids, so switch to a count only together with those cells.
queryx = """select EXTRACT(year FROM creation_date) AS year, sum(id) as posts
        from `bigquery-public-data.stackoverflow.posts_questions`
        where extract(year from creation_date) >= 2009 and extract(year from creation_date) < 2022
        group by year
        order by year
        """

PostsCount = stackoverflow.query_to_pandas(queryx)
print(PostsCount)

In [97]:
# Summary statistics of the yearly totals.
PostsCount.describe()

In [98]:
# First rows of the yearly totals.
PostsCount.head()

In [99]:
# Pull the tag strings of up to 200,000 questions to feed the word cloud.
# NOTE(review): LIMIT without ORDER BY returns an arbitrary subset, so the
# sample (and therefore the cloud) can differ between runs.
query4 = """SELECT tags
         FROM `bigquery-public-data.stackoverflow.posts_questions`
         LIMIT 200000;
         """

alltags = stackoverflow.query_to_pandas_safe(query4)
# query_to_pandas_safe gives back None (after printing a message) when the
# estimated scan exceeds its size limit; fail here with a clear error instead
# of an AttributeError on `.tags` below.
if alltags is None:
    raise RuntimeError("Tag query was cancelled by query_to_pandas_safe (scan size limit exceeded)")

# Join all tag strings into one lower-cased text blob; rows without tags are
# dropped because str.join cannot handle missing values.
tags = ' '.join(alltags.tags.dropna()).lower()

In [100]:
# Build a word cloud from the concatenated tag text.
cloud = wordcloud.WordCloud(background_color='black',
                            max_font_size=200,
                            width=1600,
                            height=800,
                            max_words=300,
                            relative_scaling=.5).generate(tags)

plt.figure(figsize=(20,10))
# Draw the image first: saving before imshow() wrote an empty figure to disk.
plt.imshow(cloud)
plt.axis('off')
plt.savefig('stackOverflow.png')
plt.show()

In [101]:
# pd.to_numeric returns a new Series; store it so the conversion actually takes
# effect, then echo the column as the cell output.
PostsCount['year'] = pd.to_numeric(PostsCount['year'])
PostsCount['year']

In [102]:
# Feature (year) and target (posts) as single-column 2-D arrays of shape (n_rows, 1).
year = PostsCount[['year']].to_numpy()
posts = PostsCount[['posts']].to_numpy()

In [103]:
# Linear model for the yearly totals; it is fitted in a later cell.
reg = LinearRegression()

In [104]:
# Chronological hold-out: with shuffle=False the trailing 20% of the
# year-ordered rows become the test set.
X_train, X_test, y_train, y_test = train_test_split(
    year,
    posts,
    test_size=0.2,
    shuffle=False,
)

In [105]:
# Fit on the training years and predict the held-out years (fit() returns the model itself).
predictions = reg.fit(X_train, y_train).predict(X_test)

In [106]:
# Show the model's predictions for the held-out years.
print('Predicted values\n',predictions)

In [107]:
# Training points, held-out points and the fitted line over the held-out years.
# Each label is attached to its own artist, so the legend cannot get out of
# step with the order in which the artists were drawn.
plt.scatter(X_train, y_train, color="black", label='Y-Train')
plt.scatter(X_test, y_test, color="green", label='Y-Test')
plt.plot(X_test, predictions, color="red", label='Y-Predicted')
plt.legend()
plt.title('Y-train and Y-test and Y-predicted')
plt.xlabel('Year')
plt.ylabel('Posts')
plt.show()

In [108]:
# Held-out points against the model's predictions for the same years.
# The legend used to say 'Y-Train'/'Y-Test' although this plot only contains
# the test points and the prediction line.
plt.scatter(X_test, y_test, color="green", label='Y-Test')
plt.plot(X_test, predictions, color="red", label='Y-Predicted')
plt.legend()
plt.title('Y-test and Y-predicted')
plt.xlabel('Year')
plt.ylabel('Posts')
plt.show()

In [109]:
# Score the fitted model on the held-out years (R^2 for LinearRegression).
reg.score(X_test,y_test)

In [110]:
# Mean squared error of the predictions on the held-out years.
print('Mean Squared Error:', metrics.mean_squared_error(y_test, predictions))

In [111]:
# Root of the MSE, i.e. the error expressed in the same unit as `posts`.
print('Root Mean Squared Error:',np.sqrt(metrics.mean_squared_error(y_test, predictions)))

In [112]:
# Percentage of each year's questions (2009-2018) that carry at least one
# web-development tag. COUNTIF / COUNT(*) counts questions (SUM(id) would add
# up question ids) and normalises inside the same GROUP BY, so every share is
# tied to the total of its own year; a question matching several patterns is
# counted once.
query = """select extract(year from creation_date) as year,
               countif(tags like '%bootstrap%' or tags like '%angularjs%' or tags like '%php%'
                       or tags like '%html%' or tags like '%javascript%' or tags like '%css%')
                   * 100 / count(*) as posts
        from `bigquery-public-data.stackoverflow.posts_questions`
        where extract(year from creation_date) >= 2009 and extract(year from creation_date) < 2019
        group by year
        order by year
        """

WebDev_Posts = stackoverflow.query_to_pandas(query)
WebDev_Posts

In [113]:
# Summary statistics of the yearly web-development percentages.
WebDev_Posts.describe()

In [114]:
# Feature (year) and target (percentage) as single-column 2-D arrays of shape (n_rows, 1).
WebDevYear = WebDev_Posts[['year']].to_numpy()
WebDevPosts = WebDev_Posts[['posts']].to_numpy()

In [115]:
# Chronological hold-out: with shuffle=False the trailing 20% of rows become the test set.
XWebDev_train, XWebDev_test, yWebDev_train, yWebDev_test = train_test_split(
    WebDevYear,
    WebDevPosts,
    test_size=0.2,
    shuffle=False,
)

In [116]:
# Fit a linear trend on the training years, then predict the held-out years.
WebDevReg = LinearRegression().fit(XWebDev_train, yWebDev_train)
WebDevPredictions = WebDevReg.predict(XWebDev_test)
print('Predicted Values:\n', WebDevPredictions)

In [117]:
# Training points, held-out points and the fitted line for web development.
# Labels are attached to their artists so the legend always names the right
# series; the y axis shows percentages, not raw post counts.
plt.scatter(XWebDev_train, yWebDev_train, color="black", label='Y-Train')
plt.scatter(XWebDev_test, yWebDev_test, color="green", label='Y-Test')
plt.plot(XWebDev_test, WebDevPredictions, color="red", label='Y-Predicted')
plt.legend()
plt.title('WEB DEVELOPMENT')
plt.xlabel('Year')
plt.ylabel('Posts %')
plt.show()

In [118]:
# Score the web-development model on the held-out years (R^2 for LinearRegression).
WebDevReg.score(XWebDev_test,yWebDev_test)

In [119]:
# Mean squared error of the web-development predictions on the held-out years.
print('Mean Squared Error:',metrics.mean_squared_error(yWebDev_test, WebDevPredictions))

In [120]:
# Root of the MSE, in the same unit as the predicted percentages.
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(yWebDev_test, WebDevPredictions)))

In [121]:
# Percentage of each year's questions (2009-2018) whose tags contain "angularjs".
# COUNTIF / COUNT(*) counts questions (SUM(id) would add up question ids) and
# normalises inside the same GROUP BY, so every share is tied to its own year;
# years without a matching question come back as 0 instead of being missing.
query = """select extract(year from creation_date) as year,
               countif(tags like '%angularjs%') * 100 / count(*) as posts
        from `bigquery-public-data.stackoverflow.posts_questions`
        where extract(year from creation_date) >= 2009 and extract(year from creation_date) < 2019
        group by year
        order by year
        """

AngularJSPosts = stackoverflow.query_to_pandas(query)
AngularJSPosts

In [122]:
# Percentage of each year's questions (2009-2018) whose tags contain "bootstrap".
# COUNTIF / COUNT(*) counts questions (SUM(id) would add up question ids) and
# normalises inside the same GROUP BY, so every share is tied to its own year.
query = """select extract(year from creation_date) as year,
               countif(tags like '%bootstrap%') * 100 / count(*) as posts
        from `bigquery-public-data.stackoverflow.posts_questions`
        where extract(year from creation_date) >= 2009 and extract(year from creation_date) < 2019
        group by year
        order by year
        """

BootstrapPosts = stackoverflow.query_to_pandas(query)
# Store the converted column; a bare pd.to_numeric(...) call changes nothing.
BootstrapPosts['year'] = pd.to_numeric(BootstrapPosts['year'])
BootstrapPosts

In [123]:
# Percentage of each year's questions (2009-2018) whose tags contain "php".
# COUNTIF / COUNT(*) counts questions (SUM(id) would add up question ids) and
# normalises inside the same GROUP BY, so every share is tied to its own year.
query = """select extract(year from creation_date) as year,
               countif(tags like '%php%') * 100 / count(*) as posts
        from `bigquery-public-data.stackoverflow.posts_questions`
        where extract(year from creation_date) >= 2009 and extract(year from creation_date) < 2019
        group by year
        order by year
        """

PHPPosts = stackoverflow.query_to_pandas(query)
# Store the converted column; a bare pd.to_numeric(...) call changes nothing.
PHPPosts['year'] = pd.to_numeric(PHPPosts['year'])
PHPPosts

In [124]:
# Percentage of each year's questions (2009-2018) whose tags contain "html".
# COUNTIF / COUNT(*) counts questions (SUM(id) would add up question ids) and
# normalises inside the same GROUP BY, so every share is tied to its own year.
query = """select extract(year from creation_date) as year,
               countif(tags like '%html%') * 100 / count(*) as posts
        from `bigquery-public-data.stackoverflow.posts_questions`
        where extract(year from creation_date) >= 2009 and extract(year from creation_date) < 2019
        group by year
        order by year
        """

htmlPosts = stackoverflow.query_to_pandas(query)
# Store the converted column; a bare pd.to_numeric(...) call changes nothing.
htmlPosts['year'] = pd.to_numeric(htmlPosts['year'])
htmlPosts

In [125]:
# Percentage of each year's questions (2009-2018) whose tags contain "javascript".
# COUNTIF / COUNT(*) counts questions (SUM(id) would add up question ids) and
# normalises inside the same GROUP BY, so every share is tied to its own year.
query = """select extract(year from creation_date) as year,
               countif(tags like '%javascript%') * 100 / count(*) as posts
        from `bigquery-public-data.stackoverflow.posts_questions`
        where extract(year from creation_date) >= 2009 and extract(year from creation_date) < 2019
        group by year
        order by year
        """

JavaScriptPosts = stackoverflow.query_to_pandas(query)
# Store the converted column; a bare pd.to_numeric(...) call changes nothing.
JavaScriptPosts['year'] = pd.to_numeric(JavaScriptPosts['year'])
JavaScriptPosts

In [126]:
# Percentage of each year's questions (2009-2018) whose tags contain "css".
# COUNTIF / COUNT(*) counts questions (SUM(id) would add up question ids) and
# normalises inside the same GROUP BY, so every share is tied to its own year.
query = """select extract(year from creation_date) as year,
               countif(tags like '%css%') * 100 / count(*) as posts
        from `bigquery-public-data.stackoverflow.posts_questions`
        where extract(year from creation_date) >= 2009 and extract(year from creation_date) < 2019
        group by year
        order by year
        """

CSSPosts = stackoverflow.query_to_pandas(query)
# Store the converted column; a bare pd.to_numeric(...) call changes nothing.
CSSPosts['year'] = pd.to_numeric(CSSPosts['year'])
CSSPosts

In [127]:
# One percentage column per technology, aligned on year.
# Naming each series up front avoids the repeated `posts_x` / `posts_y` merge
# suffixes (which newer pandas versions reject as duplicate columns) and lets
# the legend come from the column names instead of from column order.
web_dev_frames = [
    ('PHP', PHPPosts),
    ('HTML', htmlPosts),
    ('JavaScript', JavaScriptPosts),
    ('AngularJS', AngularJSPosts),
    ('BootStrap', BootstrapPosts),
    ('CSS', CSSPosts),
]
WebDev = pd.concat(
    [frame.set_index('year')['posts'].rename(label) for label, frame in web_dev_frames],
    axis=1,
)
# A year that is absent from one technology's frame means "no matching
# questions", so show it as 0 rather than dropping the year for every series.
WebDev = WebDev.fillna(0).sort_index()

WebDev.plot(kind='line')
plt.xlabel('Year', fontsize=15)
plt.ylabel('Posts %', fontsize=15)
y_pos=[2009,2010,2011,2012,2013,2014,2015,2016,2017,2018]

plt.xticks(y_pos,fontsize=10)
plt.yticks(fontsize=10)
plt.title('Web Development')
plt.legend(loc=[1.0,0.5])  # labels are the column names assigned above
plt.show()

In [128]:
#mysql,mongodb,nosql,postgresql,cassandra
# Percentage of each year's questions (2009-2018) that carry at least one of
# the database tags above. COUNTIF / COUNT(*) counts questions (SUM(id) would
# add up question ids) and normalises inside the same GROUP BY, so every share
# is tied to the total of its own year; a question matching several patterns
# is counted once.
query = """select extract(year from creation_date) as year,
               countif(tags like '%mysql%' or tags like '%nosql%' or tags like '%mongodb%'
                       or tags like '%postgresql%' or tags like '%cassandra%')
                   * 100 / count(*) as posts
        from `bigquery-public-data.stackoverflow.posts_questions`
        where extract(year from creation_date) >= 2009 and extract(year from creation_date) < 2019
        group by year
        order by year
        """

DataBase_Posts = stackoverflow.query_to_pandas(query)
DataBase_Posts

In [129]:
# Summary statistics of the yearly database percentages.
DataBase_Posts.describe()

In [130]:
# pd.to_numeric returns a new Series; store it so the conversion actually takes
# effect, then echo the column as the cell output.
DataBase_Posts['year'] = pd.to_numeric(DataBase_Posts['year'])
DataBase_Posts['year']

In [131]:
# Feature (year) and target (percentage) as single-column 2-D arrays of shape (n_rows, 1).
DataBaseYear = DataBase_Posts[['year']].to_numpy()
DataBasePosts = DataBase_Posts[['posts']].to_numpy()

In [132]:
# Chronological hold-out: with shuffle=False the trailing 20% of rows become the test set.
XDataBase_train, XDataBase_test, yDataBase_train, yDataBase_test = train_test_split(
    DataBaseYear,
    DataBasePosts,
    test_size=0.2,
    shuffle=False,
)

In [133]:
# Fit a linear trend on the training years, then predict the held-out years.
DataBaseReg = LinearRegression().fit(XDataBase_train, yDataBase_train)
DataBasePredictions = DataBaseReg.predict(XDataBase_test)
print('Predicted Values:\n', DataBasePredictions)

In [134]:
# Training points, held-out points and the fitted line for database technologies.
# Labels are attached to their artists so the legend always names the right
# series; the y axis shows percentages, not raw post counts.
plt.scatter(XDataBase_train, yDataBase_train, color="black", label='Y-Train')
plt.scatter(XDataBase_test, yDataBase_test, color="green", label='Y-Test')
plt.plot(XDataBase_test, DataBasePredictions, color="red", label='Y-Predicted')
plt.legend()
plt.title('Database Technologies')
plt.xlabel('Year')
plt.ylabel('Posts %')
plt.show()

In [135]:
# Held-out points against the model's predictions for the same years.
# The legend used to say 'Y-Train'/'Y-Test' although this plot only contains
# the test points and the prediction line.
plt.scatter(XDataBase_test, yDataBase_test, color="green", label='Y-Test')
plt.plot(XDataBase_test, DataBasePredictions, color="red", label='Y-Predicted')
plt.legend()
plt.title('Database Technologies')
plt.xlabel('Year')
plt.ylabel('Posts %')
plt.show()

In [136]:
# Score the database model on the held-out years (R^2 for LinearRegression).
DataBaseReg.score(XDataBase_test, yDataBase_test)

In [137]:
# Mean squared error of the database predictions on the held-out years.
print('Mean Squared Error:', metrics.mean_squared_error(yDataBase_test, DataBasePredictions))

In [138]:
# Root of the MSE, in the same unit as the predicted percentages.
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(yDataBase_test, DataBasePredictions)))

In [139]:
# Percentage of each year's questions (2009-2018) whose tags contain "mysql".
# COUNTIF / COUNT(*) counts questions (SUM(id) would add up question ids) and
# normalises inside the same GROUP BY, so every share is tied to its own year.
query = """select extract(year from creation_date) as year,
               countif(tags like '%mysql%') * 100 / count(*) as posts
        from `bigquery-public-data.stackoverflow.posts_questions`
        where extract(year from creation_date) >= 2009 and extract(year from creation_date) < 2019
        group by year
        order by year
        """

MySQLPosts = stackoverflow.query_to_pandas(query)
# Store the converted column; a bare pd.to_numeric(...) call changes nothing.
MySQLPosts['year'] = pd.to_numeric(MySQLPosts['year'])
MySQLPosts

In [140]:
# Percentage of each year's questions (2009-2018) whose tags contain "mongodb".
# COUNTIF / COUNT(*) counts questions (SUM(id) would add up question ids) and
# normalises inside the same GROUP BY, so every share is tied to its own year;
# years without a matching question come back as 0 instead of being missing.
query = """select extract(year from creation_date) as year,
               countif(tags like '%mongodb%') * 100 / count(*) as posts
        from `bigquery-public-data.stackoverflow.posts_questions`
        where extract(year from creation_date) >= 2009 and extract(year from creation_date) < 2019
        group by year
        order by year
        """

MongoDBPosts = stackoverflow.query_to_pandas(query)
# Store the converted column; a bare pd.to_numeric(...) call changes nothing.
MongoDBPosts['year'] = pd.to_numeric(MongoDBPosts['year'])
MongoDBPosts

In [141]:
# Percentage of each year's questions (2009-2018) whose tags contain "nosql".
# COUNTIF / COUNT(*) counts questions (SUM(id) would add up question ids) and
# normalises inside the same GROUP BY, so every share is tied to its own year;
# years without a matching question come back as 0 instead of being missing.
query = """select extract(year from creation_date) as year,
               countif(tags like '%nosql%') * 100 / count(*) as posts
        from `bigquery-public-data.stackoverflow.posts_questions`
        where extract(year from creation_date) >= 2009 and extract(year from creation_date) < 2019
        group by year
        order by year
        """

NoSQLPosts = stackoverflow.query_to_pandas(query)
# Store the converted column; a bare pd.to_numeric(...) call changes nothing.
NoSQLPosts['year'] = pd.to_numeric(NoSQLPosts['year'])
NoSQLPosts

In [142]:
# Percentage of each year's questions (2009-2018) whose tags contain "postgresql".
# COUNTIF / COUNT(*) counts questions (SUM(id) would add up question ids) and
# normalises inside the same GROUP BY, so every share is tied to its own year.
query = """select extract(year from creation_date) as year,
               countif(tags like '%postgresql%') * 100 / count(*) as posts
        from `bigquery-public-data.stackoverflow.posts_questions`
        where extract(year from creation_date) >= 2009 and extract(year from creation_date) < 2019
        group by year
        order by year
        """

PostgreSQLPosts = stackoverflow.query_to_pandas(query)
# Store the converted column; a bare pd.to_numeric(...) call changes nothing.
PostgreSQLPosts['year'] = pd.to_numeric(PostgreSQLPosts['year'])
PostgreSQLPosts

In [143]:
# Percentage of each year's questions (2009-2018) whose tags contain "cassandra".
# COUNTIF / COUNT(*) counts questions (SUM(id) would add up question ids) and
# normalises inside the same GROUP BY, so every share is tied to its own year;
# years without a matching question come back as 0 instead of being missing.
query = """select extract(year from creation_date) as year,
               countif(tags like '%cassandra%') * 100 / count(*) as posts
        from `bigquery-public-data.stackoverflow.posts_questions`
        where extract(year from creation_date) >= 2009 and extract(year from creation_date) < 2019
        group by year
        order by year
        """

CassandraPosts = stackoverflow.query_to_pandas(query)
# Store the converted column; a bare pd.to_numeric(...) call changes nothing.
CassandraPosts['year'] = pd.to_numeric(CassandraPosts['year'])
CassandraPosts

In [144]:
# One percentage column per database technology, aligned on year.
# Naming each series up front avoids the repeated `posts_x` / `posts_y` merge
# suffixes (which newer pandas versions reject as duplicate columns) and lets
# the legend come from the column names instead of from column order.
database_frames = [
    ('MySQL', MySQLPosts),
    ('NoSQL', NoSQLPosts),
    ('MongoDB', MongoDBPosts),
    ('PostgreSQL', PostgreSQLPosts),
    ('Cassandra', CassandraPosts),
]
DataBase = pd.concat(
    [frame.set_index('year')['posts'].rename(label) for label, frame in database_frames],
    axis=1,
)
# A year that is absent from one technology's frame means "no matching
# questions", so show it as 0 rather than dropping the year for every series.
DataBase = DataBase.fillna(0).sort_index()

DataBase.plot(kind='line')
plt.xlabel('Year', fontsize=15)
plt.ylabel('Posts %', fontsize=15)
y_pos=[2009,2010,2011,2012,2013,2014,2015,2016,2017,2018]

plt.xticks(y_pos,fontsize=10)
plt.yticks(fontsize=10)
plt.title('Database Technologies')
plt.legend(loc=[1.0,0.5])  # labels are the column names assigned above
plt.show()

In [145]:
#hadoop,hive,spark,hbase,kafka
# Percentage of each year's questions (2009-2018) that carry at least one of
# the big-data tags above. COUNTIF / COUNT(*) counts questions (SUM(id) would
# add up question ids) and normalises inside the same GROUP BY, so every share
# is tied to the total of its own year; a question matching several patterns
# is counted once.
query = """select extract(year from creation_date) as year,
               countif(tags like '%hadoop%' or tags like '%spark%' or tags like '%hive%'
                       or tags like '%hbase%' or tags like '%kafka%')
                   * 100 / count(*) as posts
        from `bigquery-public-data.stackoverflow.posts_questions`
        where extract(year from creation_date) >= 2009 and extract(year from creation_date) < 2019
        group by year
        order by year
        """

BigData_Posts = stackoverflow.query_to_pandas(query)
BigData_Posts

In [146]:
# Summary statistics of the yearly big-data percentages.
BigData_Posts.describe()

In [147]:
# pd.to_numeric returns a new Series; store it so the conversion actually takes
# effect, then echo the column as the cell output.
BigData_Posts['year'] = pd.to_numeric(BigData_Posts['year'])
BigData_Posts['year']

In [148]:
# Feature (year) and target (percentage) as single-column 2-D arrays of shape (n_rows, 1).
BigDataYear = BigData_Posts[['year']].to_numpy()
BigDataPosts = BigData_Posts[['posts']].to_numpy()

In [149]:
# Chronological hold-out: with shuffle=False the trailing 20% of rows become the test set.
XBigData_train, XBigData_test, yBigData_train, yBigData_test = train_test_split(
    BigDataYear,
    BigDataPosts,
    test_size=0.2,
    shuffle=False,
)

In [150]:
# Fit a linear trend on the training years, then predict the held-out years.
BigDataReg = LinearRegression().fit(XBigData_train, yBigData_train)
BigDataPredictions = BigDataReg.predict(XBigData_test)
print('Predicted Values:\n', BigDataPredictions)

In [151]:
# Training points, held-out points and the fitted line for big data.
# Labels are attached to their artists so the legend always names the right
# series; the y axis shows percentages, not raw post counts.
plt.scatter(XBigData_train, yBigData_train, color="black", label='Y-Train')
plt.scatter(XBigData_test, yBigData_test, color="green", label='Y-Test')
plt.plot(XBigData_test, BigDataPredictions, color="red", label='Y-Predicted')
plt.legend()
plt.title('Big Data')
plt.xlabel('Year')
plt.ylabel('Posts %')
plt.show()

In [152]:
# Held-out points against the model's predictions for the same years.
# The legend used to say 'Y-Train'/'Y-Test' although this plot only contains
# the test points and the prediction line.
plt.scatter(XBigData_test, yBigData_test, color="green", label='Y-Test')
plt.plot(XBigData_test, BigDataPredictions, color="red", label='Y-Predicted')
plt.legend()
plt.title('Big Data')
plt.xlabel('Year')
plt.ylabel('Posts %')
plt.show()

In [153]:
# Score the big-data model on the held-out years (R^2 for LinearRegression).
BigDataReg.score(XBigData_test, yBigData_test)

In [154]:
# Mean squared error of the big-data predictions on the held-out years.
print('Mean Squared Error:', metrics.mean_squared_error(yBigData_test, BigDataPredictions))

In [155]:
# Root of the MSE, in the same unit as the predicted percentages.
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(yBigData_test, BigDataPredictions)))

In [157]:
# Percentage of each year's questions (2009-2018) whose tags contain "hadoop".
# COUNTIF / COUNT(*) counts questions (SUM(id) would add up question ids) and
# normalises inside the same GROUP BY, so every share is tied to its own year.
query = """select extract(year from creation_date) as year,
               countif(tags like '%hadoop%') * 100 / count(*) as posts
        from `bigquery-public-data.stackoverflow.posts_questions`
        where extract(year from creation_date) >= 2009 and extract(year from creation_date) < 2019
        group by year
        order by year
        """

HadoopPosts = stackoverflow.query_to_pandas(query)
# Store the converted column; a bare pd.to_numeric(...) call changes nothing.
HadoopPosts['year'] = pd.to_numeric(HadoopPosts['year'])
HadoopPosts

In [158]:
# Percentage of each year's questions (2009-2018) whose tags contain "hive".
# COUNTIF / COUNT(*) counts questions (SUM(id) would add up question ids) and
# normalises inside the same GROUP BY, so every share is tied to its own year.
# NOTE(review): '%hive%' is a substring match, so any tag that merely contains
# "hive" (e.g. one like "archive") is counted as well - confirm this is intended.
query = """select extract(year from creation_date) as year,
               countif(tags like '%hive%') * 100 / count(*) as posts
        from `bigquery-public-data.stackoverflow.posts_questions`
        where extract(year from creation_date) >= 2009 and extract(year from creation_date) < 2019
        group by year
        order by year
        """

HivePosts = stackoverflow.query_to_pandas(query)
# Store the converted column; a bare pd.to_numeric(...) call changes nothing.
HivePosts['year'] = pd.to_numeric(HivePosts['year'])
HivePosts

In [159]:
# Percentage of each year's questions (2009-2018) whose tags contain "spark".
# COUNTIF / COUNT(*) counts questions (SUM(id) would add up question ids) and
# normalises inside the same GROUP BY, so every share is tied to its own year.
query = """select extract(year from creation_date) as year,
               countif(tags like '%spark%') * 100 / count(*) as posts
        from `bigquery-public-data.stackoverflow.posts_questions`
        where extract(year from creation_date) >= 2009 and extract(year from creation_date) < 2019
        group by year
        order by year
        """

SparkPosts = stackoverflow.query_to_pandas(query)
# Store the converted column; a bare pd.to_numeric(...) call changes nothing.
SparkPosts['year'] = pd.to_numeric(SparkPosts['year'])
SparkPosts

In [160]:
# Percentage of each year's questions (2009-2018) whose tags contain "hbase".
# COUNTIF / COUNT(*) counts questions (SUM(id) would add up question ids) and
# normalises inside the same GROUP BY, so every share is tied to its own year;
# years without a matching question come back as 0 instead of being missing.
query = """select extract(year from creation_date) as year,
               countif(tags like '%hbase%') * 100 / count(*) as posts
        from `bigquery-public-data.stackoverflow.posts_questions`
        where extract(year from creation_date) >= 2009 and extract(year from creation_date) < 2019
        group by year
        order by year
        """

HBasePosts = stackoverflow.query_to_pandas(query)
# Store the converted column; a bare pd.to_numeric(...) call changes nothing.
HBasePosts['year'] = pd.to_numeric(HBasePosts['year'])
HBasePosts

In [162]:
# Percentage of each year's questions (2009-2018) whose tags contain "kafka".
# COUNTIF / COUNT(*) counts questions (SUM(id) would add up question ids) and
# normalises inside the same GROUP BY, so every share is tied to the total of
# its own year. Dividing by PostsCount.posts instead paired rows by position,
# which is wrong as soon as the first returned year is not 2009.
# HAVING keeps omitting years without any matching question, so the
# zero-padding done in the following cell continues to apply unchanged.
query = """select extract(year from creation_date) as year,
               countif(tags like '%kafka%') * 100 / count(*) as posts
        from `bigquery-public-data.stackoverflow.posts_questions`
        where extract(year from creation_date) >= 2009 and extract(year from creation_date) < 2019
        group by year
        having countif(tags like '%kafka%') > 0
        order by year
        """

KafkaPosts = stackoverflow.query_to_pandas(query)
# Store the converted column; a bare pd.to_numeric(...) call changes nothing.
KafkaPosts['year'] = pd.to_numeric(KafkaPosts['year'])
KafkaPosts

In [163]:
# Add explicit 0% rows for 2009 and 2010 (presumably the Kafka query returned
# no rows for them) so the frame covers the same years as the others.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; only
# years not already present are added, so re-running the cell is harmless.
df = pd.DataFrame({"year":[2009,2010],"posts":[0,0]})
missing_years = df[~df["year"].isin(KafkaPosts["year"])]
KafkaPosts = (
    pd.concat([KafkaPosts, missing_years], ignore_index=True)
    .sort_values("year")
    .reset_index(drop=True)
)
KafkaPosts

In [164]:
# One percentage column per big-data technology, aligned on year.
# Naming each series up front avoids the repeated `posts_x` / `posts_y` merge
# suffixes (which newer pandas versions reject as duplicate columns) and lets
# the legend come from the column names instead of from column order.
big_data_frames = [
    ('Hadoop', HadoopPosts),
    ('Spark', SparkPosts),
    ('Hive', HivePosts),
    ('HBase', HBasePosts),
    ('Kafka', KafkaPosts),
]
BigData = pd.concat(
    [frame.set_index('year')['posts'].rename(label) for label, frame in big_data_frames],
    axis=1,
)
# A year that is absent from one technology's frame means "no matching
# questions", so show it as 0 rather than dropping the year for every series.
BigData = BigData.fillna(0).sort_index()

BigData.plot(kind='line')
plt.xlabel('Year', fontsize=15)
plt.ylabel('Posts %', fontsize=15)
y_pos=[2009,2010,2011,2012,2013,2014,2015,2016,2017,2018]

plt.xticks(y_pos,fontsize=10)
plt.yticks(fontsize=10)
plt.title('Big Data')
plt.legend(loc=[1.0,0.5])  # labels are the column names assigned above
plt.show()

In [165]:
#pandas,matplotlib,regression,svm,kaggle
# Percentage of each year's questions (2009-2018) that carry at least one of
# the data-science tags above. COUNTIF / COUNT(*) counts questions (SUM(id)
# would add up question ids) and normalises inside the same GROUP BY, so every
# share is tied to the total of its own year; a question matching several
# patterns is counted once.
query = """select extract(year from creation_date) as year,
               countif(tags like '%pandas%' or tags like '%matplotlib%' or tags like '%regression%'
                       or tags like '%svm%' or tags like '%kaggle%')
                   * 100 / count(*) as posts
        from `bigquery-public-data.stackoverflow.posts_questions`
        where extract(year from creation_date) >= 2009 and extract(year from creation_date) < 2019
        group by year
        order by year
        """

DataScience_Posts = stackoverflow.query_to_pandas(query)
DataScience_Posts

In [166]:
# Summary statistics of the yearly data-science percentages.
DataScience_Posts.describe()

In [167]:
# pd.to_numeric returns a new Series; store it so the conversion actually takes
# effect, then echo the column as the cell output.
DataScience_Posts['year'] = pd.to_numeric(DataScience_Posts['year'])
DataScience_Posts['year']

In [168]:
# Feature (year) and target (percentage) as single-column 2-D arrays of shape (n_rows, 1).
DataScienceYear = DataScience_Posts[['year']].to_numpy()
DataSciencePosts = DataScience_Posts[['posts']].to_numpy()

In [169]:
# Chronological hold-out: with shuffle=False the trailing 20% of rows become the test set.
XDataScience_train, XDataScience_test, yDataScience_train, yDataScience_test = train_test_split(
    DataScienceYear,
    DataSciencePosts,
    test_size=0.2,
    shuffle=False,
)

In [170]:
# Fit a linear trend on the training years, then predict the held-out years.
DataScienceReg = LinearRegression().fit(XDataScience_train, yDataScience_train)
DataSciencePredictions = DataScienceReg.predict(XDataScience_test)
print('Predicted Values:\n', DataSciencePredictions)

In [171]:
# Training points, held-out points and the fitted line for data science.
# Labels are attached to their artists so the legend always names the right
# series; the y axis shows percentages, not raw post counts.
plt.scatter(XDataScience_train, yDataScience_train, color="black", label='Y-Train')
plt.scatter(XDataScience_test, yDataScience_test, color="green", label='Y-Test')
plt.plot(XDataScience_test, DataSciencePredictions, color="red", label='Y-Predicted')
plt.legend()
plt.title('Data Science')
plt.xlabel('Year')
plt.ylabel('Posts %')
plt.show()

In [172]:
# Held-out points against the model's predictions for the same years.
# The legend used to say 'Y-Train'/'Y-Test' although this plot only contains
# the test points and the prediction line.
plt.scatter(XDataScience_test, yDataScience_test, color="green", label='Y-Test')
plt.plot(XDataScience_test, DataSciencePredictions, color="red", label='Y-Predicted')
plt.legend()
plt.title('Data Science')
plt.xlabel('Year')
plt.ylabel('Posts %')
plt.show()

In [173]:
# Score the data-science model on the held-out years (R^2 for LinearRegression).
DataScienceReg.score(XDataScience_test,yDataScience_test)

In [174]:
# Mean squared error of the data-science predictions on the held-out years.
print('Mean Squared Error:', metrics.mean_squared_error(yDataScience_test, DataSciencePredictions))

In [175]:
# Root of the MSE, in the same unit as the predicted percentages.
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(yDataScience_test, DataSciencePredictions)))

In [176]:
# Percentage of each year's questions (2009-2018) whose tags contain "pandas".
# COUNTIF / COUNT(*) counts questions (SUM(id) would add up question ids) and
# normalises inside the same GROUP BY, so every share is tied to the total of
# its own year. Dividing by PostsCount.posts instead paired rows by position,
# which is wrong as soon as the first returned year is not 2009.
# HAVING keeps omitting years without any matching question, so the
# zero-padding done in the following cell continues to apply unchanged.
query = """select extract(year from creation_date) as year,
               countif(tags like '%pandas%') * 100 / count(*) as posts
        from `bigquery-public-data.stackoverflow.posts_questions`
        where extract(year from creation_date) >= 2009 and extract(year from creation_date) < 2019
        group by year
        having countif(tags like '%pandas%') > 0
        order by year
        """

PandasPosts = stackoverflow.query_to_pandas(query)
# Store the converted column; a bare pd.to_numeric(...) call changes nothing.
PandasPosts['year'] = pd.to_numeric(PandasPosts['year'])
PandasPosts

In [177]:
# Add an explicit 0% row for 2009 (presumably the pandas query returned no row
# for it) so the frame covers the same years as the others.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; only
# years not already present are added, so re-running the cell is harmless.
df = pd.DataFrame({"year":[2009],"posts":[0]})
missing_years = df[~df["year"].isin(PandasPosts["year"])]
PandasPosts = (
    pd.concat([PandasPosts, missing_years], ignore_index=True)
    .sort_values("year")
    .reset_index(drop=True)
)
PandasPosts

In [178]:
# Percentage of each year's questions (2009-2018) whose tags contain "matplotlib".
# COUNTIF / COUNT(*) counts questions (SUM(id) would add up question ids) and
# normalises inside the same GROUP BY, so every share is tied to its own year;
# years without a matching question come back as 0 instead of being missing.
query = """select extract(year from creation_date) as year,
               countif(tags like '%matplotlib%') * 100 / count(*) as posts
        from `bigquery-public-data.stackoverflow.posts_questions`
        where extract(year from creation_date) >= 2009 and extract(year from creation_date) < 2019
        group by year
        order by year
        """

MatplotlibPosts = stackoverflow.query_to_pandas(query)
# Store the converted column; a bare pd.to_numeric(...) call changes nothing.
MatplotlibPosts['year'] = pd.to_numeric(MatplotlibPosts['year'])
MatplotlibPosts

In [179]:
# Percentage of each year's questions (2009-2018) whose tags contain "regression".
# COUNTIF / COUNT(*) counts questions (SUM(id) would add up question ids) and
# normalises inside the same GROUP BY, so every share is tied to its own year;
# years without a matching question come back as 0 instead of being missing.
query = """select extract(year from creation_date) as year,
               countif(tags like '%regression%') * 100 / count(*) as posts
        from `bigquery-public-data.stackoverflow.posts_questions`
        where extract(year from creation_date) >= 2009 and extract(year from creation_date) < 2019
        group by year
        order by year
        """

RegressionPosts = stackoverflow.query_to_pandas(query)
# Store the converted column; a bare pd.to_numeric(...) call changes nothing.
RegressionPosts['year'] = pd.to_numeric(RegressionPosts['year'])
RegressionPosts

In [180]:
# Percentage of each year's questions (2009-2018) whose tags contain "svm".
# COUNTIF / COUNT(*) counts questions (SUM(id) would add up question ids) and
# normalises inside the same GROUP BY, so every share is tied to its own year;
# years without a matching question come back as 0 instead of being missing.
query = """select extract(year from creation_date) as year,
               countif(tags like '%svm%') * 100 / count(*) as posts
        from `bigquery-public-data.stackoverflow.posts_questions`
        where extract(year from creation_date) >= 2009 and extract(year from creation_date) < 2019
        group by year
        order by year
        """

SVMPosts = stackoverflow.query_to_pandas(query)
# Store the converted column; a bare pd.to_numeric(...) call changes nothing.
SVMPosts['year'] = pd.to_numeric(SVMPosts['year'])
SVMPosts

In [181]:
# Percentage of each year's questions (2009-2018) whose tags contain "kaggle".
# COUNTIF / COUNT(*) counts questions (SUM(id) would add up question ids) and
# normalises inside the same GROUP BY, so every share is tied to the total of
# its own year. Dividing by PostsCount.posts instead paired rows by position,
# which is wrong as soon as the first returned year is not 2009.
# HAVING keeps omitting years without any matching question, so the
# zero-padding done in the following cell continues to apply unchanged.
query = """select extract(year from creation_date) as year,
               countif(tags like '%kaggle%') * 100 / count(*) as posts
        from `bigquery-public-data.stackoverflow.posts_questions`
        where extract(year from creation_date) >= 2009 and extract(year from creation_date) < 2019
        group by year
        having countif(tags like '%kaggle%') > 0
        order by year
        """

KagglePosts = stackoverflow.query_to_pandas(query)
# Store the converted column; a bare pd.to_numeric(...) call changes nothing.
KagglePosts['year'] = pd.to_numeric(KagglePosts['year'])
KagglePosts

In [182]:
# Add explicit 0% rows for 2009 and 2010 (presumably the Kaggle query returned
# no rows for them) so the frame covers the same years as the others.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; only
# years not already present are added, so re-running the cell is harmless.
df = pd.DataFrame({"year":[2009,2010],"posts":[0,0]})
missing_years = df[~df["year"].isin(KagglePosts["year"])]
KagglePosts = (
    pd.concat([KagglePosts, missing_years], ignore_index=True)
    .sort_values("year")
    .reset_index(drop=True)
)
KagglePosts

In [183]:
# One percentage column per data-science topic, aligned on year.
# Naming each series up front avoids the repeated `posts_x` / `posts_y` merge
# suffixes (which newer pandas versions reject as duplicate columns) and lets
# the legend come from the column names instead of from column order.
data_science_frames = [
    ('Pandas', PandasPosts),
    ('Matplotlib', MatplotlibPosts),
    ('Regression', RegressionPosts),
    ('SVM', SVMPosts),
    ('Kaggle', KagglePosts),
]
DataScience = pd.concat(
    [frame.set_index('year')['posts'].rename(label) for label, frame in data_science_frames],
    axis=1,
)
# A year that is absent from one topic's frame means "no matching questions",
# so show it as 0 rather than dropping the year for every series.
DataScience = DataScience.fillna(0).sort_index()

DataScience.plot(kind='line')
plt.xlabel('Year', fontsize=15)
plt.ylabel('Posts %', fontsize=15)
y_pos=[2009,2010,2011,2012,2013,2014,2015,2016,2017,2018]

plt.xticks(y_pos,fontsize=10)
plt.yticks(fontsize=10)
plt.title('Data Science')
plt.legend(loc=[1.0,0.5])  # labels are the column names assigned above
plt.show()