In [None]:
# for filesystem paths
from pathlib import Path

# for manipulating dataframes
import pandas as pd

# for webscraping
from requests import get
from bs4 import BeautifulSoup

# for natural language processing
import spacy
import en_core_web_md
# Load the installed en_core_web_md pipeline once; `nlp` is reused when the article
# text is processed further down.
nlp = en_core_web_md.load()

In [None]:
# Page to scrape.
url = 'https://en.wikipedia.org/wiki/QAnon'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = get(url, timeout=30)
# Fail here on an HTTP error status instead of silently parsing an error page
# in the cells that follow.
response.raise_for_status()

In [None]:
# Preview the first 999 characters of the response body.
print(response.text[0:999])

In [None]:
# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(markup=response.content, features='html.parser')

In [None]:
# Select the first <div class="mw-parser-output">, which this notebook treats as the
# article body.
article = soup.find('div', class_='mw-parser-output')
# find() returns None when nothing matches; stop here with a clear message rather
# than hitting an AttributeError on `article.text` in a later cell.
if article is None:
    raise ValueError("no <div class='mw-parser-output'> element found in the page")

In [None]:
# Print the article's full text content (markup stripped) as it is before any cleanup.
print(article.get_text())

In [None]:
# Locate the first <div role="note"> inside the article and detach it from the tree,
# so its text no longer appears in article.text.
unwanted = article.find('div', role='note')
# find() returns None when there is no such div (for example when this cell is re-run
# after the notes have already been removed), so only extract when one was found.
if unwanted is not None:
    unwanted.extract()
# Display the removed element; nothing is shown when there was none.
unwanted

In [None]:
# Print the article's full text content again, for comparison with the earlier printout.
print(article.get_text())

In [None]:
# Run the full pipeline instead of nlp.make_doc(): make_doc only tokenizes, so no
# dependency parse is produced and every token.head is just the token itself, which
# would make the `head` values used below meaningless.
doc = nlp(article.text)

In [None]:
# Print each alphabetic, non-stop-word token next to the text of its head token.
for token in doc:
    if token.is_alpha and not token.is_stop:
        print(token.text, token.head.text)

In [None]:
# Collect the lower-cased text of every alphabetic, non-stop-word token together with
# the lower-cased text of its head token.
text_list, head_list = [], []

for token in doc:
    # Skip anything that is not purely alphabetic, and skip stop words.
    if not token.is_alpha or token.is_stop:
        continue
    text_list.append(token.text.lower())
    head_list.append(token.head.text.lower())

# One row per kept token: its text and the text of its head.
df = pd.DataFrame(data=list(zip(text_list, head_list)), columns=['text', 'head'])

In [None]:
# Show the first nine token/head rows.
df.head(n=9)

In [None]:
# Write the token/head pairs to disk without the index column. to_csv does not create
# missing folders, so make sure the output directory exists first (e.g. on a fresh
# checkout).
qanon_csv = Path('../data/out/qanon.csv')
qanon_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(qanon_csv, index=False)

In [None]:
# Count how often each (text, head) pair occurs and list the most frequent pairs first.
combos = (
    df.groupby(['text', 'head'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

In [None]:
# Show the nine most frequent text/head pairs.
combos.head(n=9)

In [None]:
# Write the pair counts to disk without the index column. to_csv does not create
# missing folders, so make sure the output directory exists first (e.g. on a fresh
# checkout).
combos_csv = Path('../data/out/qanon_combos.csv')
combos_csv.parent.mkdir(parents=True, exist_ok=True)
combos.to_csv(combos_csv, index=False)