## This first cell will be used to download the necessary modules required to run the notebook. This is a one time process and can be removed once you have all the necessary modules installed on your system.

In [136]:
!pip install requests
!pip install pandas
!pip install numpy
!pip install bs4
!pip install sklearn
!pip install flask==0.12.2 
!pip install flask-ngrok



In [0]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import pickle
import re
from flask import Flask,request
from flask_ngrok import run_with_ngrok

In [0]:
def get_wiki_data(url):
  page = requests.get(url)
  html = page.text
  soup = BeautifulSoup(html)
  content = soup.find_all("p")
  data = []
  for paragraph in content:
    data.append(paragraph.text)
  data = " ".join(data)
  return data

In [0]:
wiki_intial = "https://en.wikipedia.org/wiki/"

In [0]:
def get_list_of_imdb(url):
  html = requests.get(url).text
  content = BeautifulSoup(html)
  req_list = []
  for cont in content.find_all("div", {"class":"lister-item mode-detail"}):
    req_list.append(cont.find("h3").find("a").text)
  return req_list

In [0]:
actor_list_holly = get_list_of_imdb("https://www.imdb.com/list/ls053501318/")
names_holly = []
for actor in actor_list_holly:
  name = actor
  name = re.sub("\n", "", name)
  name = re.sub("\s", "_", name)
  name = re.sub("^_", "", name)
  names_holly.append(name)
text_data_holly = []
for name in names_holly:
  text_data_holly.append(get_wiki_data(wiki_intial+name))

In [142]:
hollywood = pd.DataFrame()
hollywood["text"] = text_data_holly
hollywood["label"] = ["Celebrity" for i in range(len(names_holly))]
hollywood.head()

Unnamed: 0,text,label
0,"\n John Christopher Depp II (born June 9, 1963...",Celebrity
1,\n \n Alfredo James Pacino (/pəˈtʃiːnoʊ/; Ital...,Celebrity
2,\n \n Robert Anthony De Niro Jr. (/də ˈnɪəroʊ/...,Celebrity
3,"\n \n Kevin Spacey Fowler KBE (born July 26, 1...",Celebrity
4,\n \n Denzel Hayes Washington Jr. (born Decemb...,Celebrity


In [0]:
def create_category_data(names,label):
  text_data = [get_wiki_data(wiki_intial+i) for i in names]
  data = pd.DataFrame()
  data["text"] = text_data
  data["label"] = [label for i in range(len(names))]
  return data

In [0]:
def throw_bad_data(names,label):
  data = create_category_data(names,label)
  ind = []
  for i,j in enumerate(data["text"]):
    if "may refer to:" in j:
      ind.append(i)
  irrelevant = data.index.isin(ind)
  data = data[~irrelevant]
  return(data)

In [145]:
chem = ["compounds", "atoms", "molecules", "ions", "reaction", 
			 "chemist", "substances", "laboratory", "particles",
			 "energy", "radioactive", "matter", "photon", "mass",
			 "atomic_nucleus", "ionization", "metallic", "covalent",
			 "periodic", "isotopes", "solids", "silica", "sulfur", 
			 "sodium", "carbon", "alloy", "avogadro", "liquid", "oxidation",
			 "chloride", "plasma", "hydronium", "quanta", "phonons", "hydrogen",
			 "sulfied", "kinetics", "hydroxide", "phosphate", "redox", "molar",
			 "molarity", "helium", "krypton"]
chemistry = throw_bad_data(chem,"Chemistry")
chemistry.head()

Unnamed: 0,text,label
1,\n An atom is the smallest constituent unit of...,Chemistry
2,\n A molecule is an electrically neutral group...,Chemistry
3,"\n An ion (/ˈaɪɒn, -ən/)[1] is an atom or mole...",Chemistry
4,"Reaction may refer to a response to an action,...",Chemistry
5,A chemist (from Greek chēm (ía) alchemy; repla...,Chemistry


In [146]:
phys = ["energy", "force", "astronomy", "sun", "moon", "stars",
		   "planets", "optic", "gravitation", "gravity", "thermodynamics", 
		   "electromagnetics", "theory_of_relativity", "photoelectric_effect",
		   "speed_of_light", "motion", "superconductivity", "higgs_boson",
		   "quantum", "quantum_mechanics", "supersymmetry", "neutrinos", "meissner_effect",
		   "magnetic_field", "fluxons", "electric_current", "geomagnetism", "inertia",
		   "newton's_laws_of_motion", "classical_mechanics", "kepler's_law","planet",
		   "thermonuclear", "fusion", "orbit", "ellipse"]
physics = throw_bad_data(phys,"Physics")
physics.head()

Unnamed: 0,text,label
0,"\n \n In physics, energy is the quantitative p...",Physics
1,"\n In physics, a force is any interaction that...",Physics
2,\n Astronomy (from Greek: ἀστρονομία) is a nat...,Physics
3,\n \n The Sun is the star at the center of the...,Physics
4,\n \n The Moon is an astronomical body orbitin...,Physics


In [147]:
maths = ["arithmetic", "algebra", "geometry", "analysis", "set_theory",
		 "number_theory", "natural_numbers", "complex_numbers", "infinity",
		 "groups", "rings", "fields", "abstract_algebra", "linear_algenra",
		 "vector_spaces", "analytic_geometry", "optimization", "tensor_calculus",
		 "convex", "topology", "real_analysis", "differential_equations", "euclidean_geometry",
		 "hypotenuse", "theorem", "volumes", "calculus", "integral", "differentiation",
		 "limits", "riemann", "fourier_series", "prime_numbers", "measure",
		 "infinite_series", "metric_space", "sine", "taylor_series", "lebesgue",
		 "hilbert_spaces", "functional_analysis", "ordered_pair"]
mathematics = create_category_data(maths,"Mathematics")
mathematics.head()

Unnamed: 0,text,label
0,"Arithmetic (from the Greek ἀριθμός arithmos, ""...",Mathematics
1,"\n Algebra (from Arabic: الجبر‎ (al-jabr, mean...",Mathematics
2,\n Geometry (from the Ancient Greek: γεωμετρία...,Mathematics
3,\nAnalysis is the process of breaking a comple...,Mathematics
4,Set theory is a branch of mathematical logic t...,Mathematics


In [148]:
stats = ["probability", "surveys_sampling", "experiments", "inference", "observational_study",
	     "data_analysis", "mean", "variance", "dispersion", "descriptive_statistics",
	     "null_hypothesis", "missing_data", "probability_theory", "sampling", "estimator",
	     "statistic", "covariance", "mean_squared_error", "consistency",
	     "maximum_likelihood", "least_squares", "power_test", "random", "bayesian",
	     "frequentist", "significance", "critical_region","correlation", "regression", 
	     "logistic_regression", "factor_analysis", "cojoint_analysis","parametric",
	     "time_series_analysis", "bootstrap", "markov_chain"]
statistics = create_category_data(stats,"Statistics")
statistics.head()

Unnamed: 0,text,label
0,\n Related concepts and fundamentals:\n Probab...,Statistics
1,Other reasons this message may be displayed:\n,Statistics
2,An experiment is a procedure carried out to su...,Statistics
3,"Inferences are steps in reasoning, moving from...",Statistics
4,"In fields such as epidemiology, social science...",Statistics


In [149]:
eco = ["production", "distribution_(economics)", "consumption_(economics)", "microeconomics",
		"macroeconomics", "demand", "supply", "national_income", "deficit",
		"resources", "market_equilibrium", "oligopoly", "monopoly", "duopoly",
		"monopsony", "investment_goods", "public_goods", "opportunity_cost",
		"factors_of_production", "pareto_efficiency", "production_possiblity_frontier",
		"scarcity", "constraints", "income_effect", "substitution_effect",
		"marginal_utility", "marginal_cost", "elasticity", "marginal_revenue",
		"perfectly_competitive","game_theory","economic_growth", "money","balance_of_payments",
		"central_bank", "fiscal_policy", "monetary_policy", "monye_supply",
		"exchange_rates", "externalities"]
economics = create_category_data(eco,"Economics")
economics.head()

Unnamed: 0,text,label
0,Production may be:\n,Economics
1,"In economics, distribution is the way total ou...",Economics
2,"Consumption, defined as spending for acquisiti...",Economics
3,\n Microeconomics (from Greek prefix mikro- me...,Economics
4,Heterodox\n Macroeconomics (from the Greek pre...,Economics


In [150]:
main_data = pd.concat((hollywood,chemistry,physics,mathematics,statistics,economics),ignore_index=True)
main_data.head()

Unnamed: 0,text,label
0,"\n John Christopher Depp II (born June 9, 1963...",Celebrity
1,\n \n Alfredo James Pacino (/pəˈtʃiːnoʊ/; Ital...,Celebrity
2,\n \n Robert Anthony De Niro Jr. (/də ˈnɪəroʊ/...,Celebrity
3,"\n \n Kevin Spacey Fowler KBE (born July 26, 1...",Celebrity
4,\n \n Denzel Hayes Washington Jr. (born Decemb...,Celebrity


In [151]:
label_mapper = {"Celebrity":0, "Chemistry":1, "Physics":2,
               "Mathematics":3, "Statistics":4, "Economics":5}
main_data["label_number"] = main_data["label"].map(label_mapper)
main_data.head()

Unnamed: 0,text,label,label_number
0,"\n John Christopher Depp II (born June 9, 1963...",Celebrity,0
1,\n \n Alfredo James Pacino (/pəˈtʃiːnoʊ/; Ital...,Celebrity,0
2,\n \n Robert Anthony De Niro Jr. (/də ˈnɪəroʊ/...,Celebrity,0
3,"\n \n Kevin Spacey Fowler KBE (born July 26, 1...",Celebrity,0
4,\n \n Denzel Hayes Washington Jr. (born Decemb...,Celebrity,0


In [0]:
stop = ['i','me','my','myself','we','our','ours','ourselves','you',"you're","you've","you'll","you'd",'your','yours',
        'yourself','yourselves','he','him','his','himself','she',"she's",'her','hers','herself','it',"it's",'its',
        'itself','they','them','their','theirs','themselves','what','which','who','whom','this','that',"that'll",
        'these','those','am','is','are','was','were','be','been','being','have','has','had','having','do','does',
        'did','doing','a','an','the','and','but','if','or','because','as','until','while','of','at','by','for',
        'with','about','against','between','into','through','during','before','after','above','below','to','from',
        'up','down','in','out','on','off','over','under','again','further','then','once','here','there','when','where',
        'why','how','all','any','both','each','few','more','most','other','some','such','no','nor','not','only','own',
        'same','so','than','too','very','s','t','can','will','just','don',"don't",'should',"should've",'now','d','ll',
        'm','o','re','ve','y','ain','aren',"aren't",'couldn',"couldn't",'didn',"didn't",'doesn',"doesn't",'hadn',"hadn't",
        'hasn',"hasn't",'haven',"haven't",'isn',"isn't",'ma','mightn',"mightn't",'mustn',"mustn't",'needn',"needn't",'shan',
        "shan't", 'shouldn',"shouldn't",'wasn',"wasn't",'weren',"weren't",'won',"won't",'wouldn',"wouldn't"]

In [0]:
def clean_text(text):
  text = text.lower()
  tokens = text.split()
  new = [word for word in tokens if word not in stop]
  text = " ".join(new)
  text = " ".join(re.split(r"[^A-Za-z]", text))
  text = re.sub(" +", " ", text)
  return(text)

In [0]:
cleaned = []
for i in main_data.text:
  cleaned.append(clean_text(i))

In [155]:
main_data["clean_text"] = cleaned
main_data.head()

Unnamed: 0,text,label,label_number,clean_text
0,"\n John Christopher Depp II (born June 9, 1963...",Celebrity,0,john christopher depp ii born june american ac...
1,\n \n Alfredo James Pacino (/pəˈtʃiːnoʊ/; Ital...,Celebrity,0,alfredo james pacino p t i no italian pa t i n...
2,\n \n Robert Anthony De Niro Jr. (/də ˈnɪəroʊ/...,Celebrity,0,robert anthony de niro jr d n ro italian de ni...
3,"\n \n Kevin Spacey Fowler KBE (born July 26, 1...",Celebrity,0,kevin spacey fowler kbe born july american act...
4,\n \n Denzel Hayes Washington Jr. (born Decemb...,Celebrity,0,denzel hayes washington jr born december ameri...


In [0]:
X = main_data.clean_text
y = main_data.label_number

In [0]:
pipeline = Pipeline([
    ("count_vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("clf", MultinomialNB())
])

In [158]:
pipeline.fit(X,y)

Pipeline(memory=None,
         steps=[('count_vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [0]:
def from_url_to_predict_data(url):
  text = get_wiki_data(url)
  cleaned_text = clean_text(text)
  data_for_prediction = pd.Series(cleaned_text)
  return data_for_prediction

def get_prediction(url,model):
  req_data = from_url_to_predict_data(url)
  pred = model.predict(req_data)
  return [name for name,code in label_mapper.items() if code == pred][0]

In [160]:
get_prediction(r"https://en.wikipedia.org/wiki/Machine_learning", pipeline)

'Statistics'

In [161]:
get_prediction(r"https://en.wikipedia.org/wiki/Pareto", pipeline)

'Economics'

In [162]:
get_prediction(r"https://en.wikipedia.org/wiki/computer_science", pipeline)

'Mathematics'

In [163]:
get_prediction(r"https://en.wikipedia.org/wiki/ramanujan", pipeline)

'Celebrity'

In [164]:
get_prediction(r"https://en.wikipedia.org/wiki/hardy", pipeline)

'Mathematics'

In [165]:
get_prediction(r"https://en.wikipedia.org/wiki/confidence_interval", pipeline)

'Statistics'

In [166]:
get_prediction(r"https://en.wikipedia.org/wiki/factor_analysis", pipeline)

'Statistics'

In [167]:
get_prediction(r"https://en.wikipedia.org/wiki/big_bang_theory", pipeline)

'Physics'

In [0]:
app = Flask(__name__)
run_with_ngrok(app)

In [0]:
html_style = '''
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>New Style</title>
    <style>
        body,html,*{
            margin:0px;
            padding: 0px;
        }
        .nesto-background{
            background: url(https://dtsvkkjw40x57.cloudfront.net/1350xnull/8160/uploads/2ab85f0f-bc62-417f-8a33-a064535a6ebd.jpg);
            opacity: 0.9;
            height: 300px;
            width: 100%;
            margin: 0px 0px 10px 0px;
            display: flex;
            flex-direction: column;
            justify-content: center;
            align-items: center;
        }
        .nesto-background div{
            align-self: flex-end;
        }

        h1 {
            border: 0px solid #F49200;
            background-color: #F49200;
            border-radius: 5px;
        }
        .remaining{
            display: flex;
            flex-direction: row;
        }
        .form{
            width: 60%;
            align-self: flex-start;
            margin-top: 100px;
        }
        .info{
            align-self: flex-end;
            display: flex;
            width:40%;
            flex-direction: column;
            justify-content: center;
            align-items: center;
            margin-top: 100px;
            margin-left:15px;
        }
        
        form{
            display: flex;
            flex-direction: column;
        }
        input[name=url]{
            width: 200px;
            height: 25px;
            margin-top: 20px;
            align-self: center;
        }

        input[name=submit]{
            width: 100px;
            height: 30px;
            outline: none;
            border: none;
            background-color: #F49200;
            color: white;
            margin-top: 10px;
            align-self: center;
        }

        h4{
            font-size: 20px;
            color: #F49200;
        }
        h3{
            color: red;
            margin-top: 10px;
        }
        ul{
            margin-top: 10px;
        }
        li{
            margin-left: 30px;
        }
        h2{
            color: #F49200;
        }
    </style>
</head>
'''

In [0]:
html_string_not_posted = html_style + '''
<body>
    <div class="nesto-background">
        <h1> Welcome to the Wikipedia Content Classification Application</h1>
        <div> <img src = "https://raw.githubusercontent.com/datageekrj/ForHostingFiles/master/NestoTV_1280x720%20(1).png" width = "200" height = "80" style = "align-self:center;"></div>
    </div>
    <div class="remaining">
        <div class="form">
            <h4>To use this application, Just Enter the URL of the wikipedia Page that you want to classify and hit submit..</h4>
            <form method="POST">
                <input type="text" name="url" class="url" placeholder="Enter URL">
                <input type="submit" name="submit" class="submit">
            </form>
        </div>
        <div class="info categories">
            <h2>Categories Included while Training the Naive Bayes Algorithm: </h2>
                <ul>
                    <li>Celebrity</li>
                    <li>Chemistry</li>
                    <li>Physics</li>
                    <li>Statistics</li>
                    <li>Mathematics</li>
                    <li>Economics</li>
                <h3> Data Source: Data Was Scraped Using Wikipedia by Python using BeautifulSoup</h3>
                </ul>
        </div>
    </div>
</body>
</html>
'''

Whereas on the other hand, once the user is displayed the prediction, there are only two things. 

* One is the `h1` tag which displays tha prediction and 
* another is a `a` tag for going back to the first state of the application. 

In [0]:
def get_html_posted_string(data):
  required_string = html_style + '''
<body>
    <div class="nesto-background">
        <h1> Welcome to the Wikipedia Content Classification Application</h1>
        <div> <img src = "https://raw.githubusercontent.com/datageekrj/ForHostingFiles/master/NestoTV_1280x720%20(1).png" width = "200" height = "80" style = "align-self:center;"></div>
    </div>
    <div class="remaining">
        <div style = "display:flex;flex-direction:column;align-items:center;justify-content:center;">
            <h1>Your URL that you have entered belongs to: <span style="color: #707070;">''' + data + '''
</span></h1><br>
        <a href="/">Go Back</a>
        </div>
        <div class="info categories">
            <h2>Categories Included while Training the Naive Bayes Algorithm: </h2>
                <ul>
                    <li>Celebrity</li>
                    <li>Chemistry</li>
                    <li>Physics</li>
                    <li>Statistics</li>
                    <li>Mathematics</li>
                    <li>Economics</li>
                <h3> Data Source: Data Was Scraped Using Wikipedia by Python using BeautifulSoup</h3>
                </ul>
        </div>
    </div>
</body>
</html>'''
  return required_string

In [0]:
@app.route('/', methods = ["GET", "POST"])
def index():
    if request.method == "POST":
        url = request.form.get("url")
        prediction = get_prediction(url, pipeline)
        required_posted_string = get_html_posted_string(prediction)
        return required_posted_string
    else:
        return html_string_not_posted

In [0]:
if __name__ == "__main__":
  app.run()

 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://0c9b1779.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


127.0.0.1 - - [01/May/2020 09:45:23] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [01/May/2020 09:45:25] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
