# TED – Topic Modeling

In [1]:
# standard libraries
from collections import Counter
from collections import OrderedDict
import pickle
import random
import re
import string
import time

# web scraping
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import requests

# data manipulation & storage
import pandas as pd

# Sklearn
from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics.pairwise import cosine_similarity

# NLP
from gensim import matutils, models
import spacy
import scipy.sparse
# import stanfordnlp
# from spacy_stanfordnlp import StanfordNLPLanguage


# visualization
from spacy import displacy
from tabulate import tabulate
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess


## Options

In [None]:
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.width', 10)
# pd.options.display.max_columns = None

# Web Scraping
##### Features to be scraped
* Speaker (author)
* Title
* Transcript
* Related tags
* About the talk

##### Nice to have for future work
* Date
* About the speaker
* Views
* Talk length

In [None]:
def make_soup(url):
    """
    Make soup for each Ted Talk transcript url.
    """
    # Generate random user-agent
    user_agent = {'User-agent': UserAgent().random}
    
    # Request page and make soup
    page = requests.get(url, headers=user_agent)
    soup = BeautifulSoup(page.content, 'lxml')
    return soup

In [None]:
def get_max_page():
    """
    Get's the max pagination number from Ted's home page.
    """
    page_num = []
    
    # Set language to English and sort by newest talks
    soup = make_soup('https://www.ted.com/talks?language=en&page=1&sort=newest')
    
    # Iterate through each pagination element and get the max
    for element in soup.find_all('a', class_='pagination__item pagination__link'):
        page_num.append(int(element.text))
    return max(page_num)

In [None]:
def get_talk_urls():
    """
    Get's all the talk urls in each talks page.
    Filters for talks in English.
    """
    talk_url_list = []
    
    # Set language to English
    talks_url = 'https://www.ted.com/talks?language=en&page='
    
    # Set range from 1 to the max page in the pagination element
    page_range = range(1, get_max_page()+1)
    
    # Iterate through each page and get the url for each talk
    for i in page_range:
        
        # Try a second attempt if first attempt fails
        for attempt in range(2):
            try:
                talks_page_url = talks_url + str(i) + '&sort=newest'
                soup = make_soup(talks_page_url)

                # Delay between searches
                time.sleep(random.uniform(1, 2))

                for div in soup.find_all('div', attrs={'class': 'media__image'}):
                    for a in div.find_all('a'):
                        talk_url_list.append(a.get('href'))
            except:
                # Delay before continuing to second attempt
                time.sleep(random.uniform(2, 3))
            
            # Break if no exceptions are raised
            else:
                break
            
    return talk_url_list

In [None]:
def construct_url():
    """
    Construct complete url for each talk's transcript page.
    Removes '?language=en' from the end of the url.
    """
    return ['https://www.ted.com' + url.replace('?language=en', '') + '/transcript' for url in get_talk_urls()]

In [None]:
def get_data():
    """
    Get title, speaker, related tags, and transcript from each talk's transcript page.
    Returns data in a nested dictionary.
    """
    # Create empty dict to nest dicts (1 nested dict per talk)
    ted_dict = {}
    
    # 1. Used to create nested dicts in 'ted_dict'
    # 2. Used as count for successfully scraped pages
    dict_id = 0
    
    # Counter for failed scraped pages
    failed_counter = 0
    
    # Iterate through each ted talk transcript url
    for url in construct_url():
        # Make soup
        soup = make_soup(url)

        # Delay between searches
        time.sleep(random.uniform(.5, 2))
        
        # Try up to three attempts to scrape data
        for attempt in range(1, 3+1):
            try:    
                # Get title
                title_tag = soup.find(attrs={'name': 'title'})

                # Get speaker
                speaker_tag = soup.find(attrs={'name': 'author'})

                # Get related tags
                match_obj = re.search(r"\"tag\":\"(.*?)\"", soup.text)
                match_obj.group(1)
                ted_tags = match_obj.group(1).split(',')

                # Get talk description
                desc_tag = soup.find(attrs={'property': 'og:description'})
                desc_str = desc_tag.attrs['content'].split(': ', 1)[1] # Split description at "TED Talk Subtitles and Transcript:"

                # Get transcript
                transcript = ''
                transcript_strings = []
                for div in soup.find_all('div', class_="Grid__cell flx-s:1 p-r:4"):
                    for p in div.find_all('p'):
                        # Add every string in the transcript to a list
                        transcript_strings.append(" ".join(p.text.split()))
                    else:
                        # After all strings have been added, create a single transcript string
                        transcript = " ".join(transcript_strings)

                # Add 1 to create a new dict_id and use it to create a nested dict
                dict_id += 1
                ted_dict[dict_id] = {}

                # Add the features above to the nested dict
                ted_dict[dict_id]['title'] = title_tag.attrs['content'].split(':')[1].strip()
                ted_dict[dict_id]['speaker'] = speaker_tag.attrs['content']
                ted_dict[dict_id]['tags'] = ted_tags
                ted_dict[dict_id]['description'] = desc_str
                ted_dict[dict_id]['transcript'] = transcript
                
                # Indicate successfull scrape
                print(dict_id, url)

            except Exception as e:
                # If the last attempt fails, update the failed counter and print the exception & talk url for debugging
                if attempt == 3:
                    failed_counter += 1
                    print(f'position: {dict_id}, exception: {e}, url: {url}\n')
                    continue
                
                # Delay before another attempt
                time.sleep(random.uniform(4, 6))
            
            # Break if no exceptions are raised
            else:
                break

    print(f"""Ted.com scraping results:
        \n\t• Success: {dict_id}
        \n\t• Failed: {failed_counter}\n""")
    
    return ted_dict

In [None]:
# ted_dict = get_data()
# print(ted_dict)

## Create dataframe

In [None]:
# Save pickle
# with open('data/ted_dict_complete.pkl', 'wb') as f:
#     pickle.dump(ted_dict, f)

# Load pickle
with open('data/ted_dict_complete.pkl', 'rb') as f:
    ted_dict = pickle.load(f)
    print(f"# of dictionaries in:\n •'ted_dict': {len(ted_dict)}")

In [None]:
# Load all data scraped from TED

# Create dataframe
# df = pd.DataFrame.from_dict(ted_dict, orient='index')

# Pickle dataframe
# df.to_pickle('data/first_df.pkl')

# Load dataframe from pickle
df = pd.read_pickle('data/first_df.pkl')
print(df.shape)

We were able to successfully scrape 3904 talks (as of February 21, 2020)!  

Only one talk failed as the transcript doesn't load, maybe we should notify TED about this bug...  
https://www.ted.com/talks/marcus_bullock_an_app_that_helps_incarcerated_people_stay_connected_to_their_families/transcript


# Data Cleaning

* Load data and create a dataframe
* Reduce scope
* Clean data, remove:
    * text in square brackets & parenthesis
    * punctuation
    * words containing numbers
    * double-quotes, dashes
* Lemmatization (spaCy)
* Save stop words (spaCy)
* Create corpus from nouns & adjectives (spaCy)
    * One for each tag: sex, religion, politics
* Create document-term matrix

## Load data and create a dataframe

In [2]:
# Load pickled dictionary of scraped data
with open('data/ted_dict_complete.pkl', 'rb') as f:
    ted_dict = pickle.load(f)
    print(f"# of dictionaries in:\n •'ted_dict': {len(ted_dict)}\n")

    # Create a dataframe
    df = pd.DataFrame.from_dict(ted_dict, orient='index')
    f.close()

# Pickle dataframe
df.to_pickle('data/first_df.pkl')

# Load dataframe from pickle
df = pd.read_pickle('data/first_df.pkl')
print(f'Shape: {df.shape}')

# of dictionaries in:
 •'ted_dict': 3904

Shape: (3904, 5)


## Reduce Scope
Filter for talks about `sex`, `religion`, and `politics`

https://stackoverflow.com/questions/16575868/efficiently-creating-additional-columns-in-a-pandas-dataframe-using-map

In [3]:
def check_tag(tag):
    """Input tag and return a list of booleans for talks that contain 'tag' by index"""
    contains_tag = []
    for ix, tag_list in enumerate(df['tags']):
        if any(tag in t for t in tag_list):
            contains_tag.append(1)
        else:
            contains_tag.append(0)
    return contains_tag

In [4]:
# Add series for selected tags
df['is_sex'] = check_tag('sex')
df['is_religion'] = check_tag('religion')
df['is_politics'] = check_tag('politics')

In [5]:
# Filter dataframe to only include talks for the selected tags
df = df.loc[(df['is_sex']==1) | (df['is_religion']==1) | (df['is_politics']==1), : ].reset_index()

### Create new dataframe with reduced scope

In [6]:
def combine_transcripts(list_of_text):
    """Input a list of text and return them into one large chunk of text."""
    combined_text = ' '.join(list_of_text)
    return combined_text

In [7]:
def transcripts_to_dict(dataframe, tag_list):
    """Creates a dictionary of transcripts for each tag.
    Input a dataframe and tag list.
    Return a nested dictionary.
    """
    ted_dict = {}
        
    # Assign parameter to tags
    tags = tag_list
    for tag in tag_list:
        # Filter dataframe to specific series and convert it to a list
        filter_string = 'is_' + str(tag)
        text_list = dataframe.loc[(df[filter_string]==1), 'transcript'].to_list()

        # Call combine_transcripts function to return combined text
        combined_text = combine_transcripts(text_list)

        # Add combined text to dict
        ted_dict[tag] = combined_text
    return ted_dict

In [8]:
# Create dictionary from the dataframe
transcript_dict = transcripts_to_dict(df, ['sex', 'religion', 'politics'])

# Construct dataframe from dictionary 
df = pd.DataFrame.from_dict(transcript_dict, orient='index')
df.rename({0: 'transcript'}, axis=1, inplace=True)

## Clean text

In [10]:
def clean_text(text):
    '''
    Remove:
        *text in square brackets & parenthesis
        *punctuation
        *words containing numbers
        *double-quotes, dashes
    '''
#     text = text.lower()
    text = re.sub('[\[\(].*?[\)\]]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('[\“\–]', '', text)
    return text

In [11]:
# Clean text
df['transcript'] = pd.DataFrame(df['transcript'].apply(lambda x: clean_text(x)))
# df['description'] = pd.DataFrame(df['description'].apply(lambda x: clean_text(x)))
# df['title'] = pd.DataFrame(df['title'].apply(lambda x: clean_text(x)))

In [109]:
# # Pickle dataframe
# df.to_pickle('data/sex_religion_politics_corpus.pkl')

# Load dataframe
df = pd.read_pickle('data/sex_religion_politics_corpus.pkl')
print(f'Shape: {df.shape}')

Shape: (3, 1)


## Corpus

### Process the corpus with spaCy


In [16]:
# Load SpaCy English model
nlp = spacy.load("en_core_web_sm")

# Transform dataframe to list of docs (corpus)
texts = df.loc[:, 'transcript'].to_list()

# Process the corpus
docs = list(nlp.pipe(texts)) # disable=['tagger', 'parser', 'textcat']

In [110]:
# # Pickle docs
# with open('data/docs_transcript.pkl', 'wb') as f:
#     pickle.dump(docs, f)
#     f.close()

# Load pickled spaCy 'docs' object
with open('data/docs_transcript.pkl', 'rb') as f:
    docs = pickle.load(f)
    f.close()

In [111]:
def get_stop_words(spaCy_doc_obj):
    """Input a Doc and return a list of spaCy stop words"""
    stop_words = []
    for doc in spaCy_doc_obj:
        for token in doc:
            if token.is_stop:
                stop_words.append(token.text.lower())
    return set(stop_words)

In [112]:
# spaCy stop words from transcripts
spacy_stop_words = get_stop_words(docs)

# # Pickle spacy_stop_words for later use
# with open('data/spacy_stop_words.pkl', 'wb') as f:
#     pickle.dump(spacy_stop_words, f)
#     f.close()

### Corpus: Nouns & adjectives
Only includes nouns & adjectives

In [114]:
def get_nouns_adj(spaCy_doc, tag_list):
    """
    Input a spaCy Doc object (corpus) and tag list.
    Return a dictionary of lemmatized nouns and adjectives per doc.
    """
    tags = tag_list
    my_dict = {}
    for ix, doc in enumerate(spaCy_doc):
        tag = tags[ix]
        token_list = []
        for token in doc:
            if token.pos_ in ['NOUN', 'ADJ']:
                token_list.append((token.lemma_).lower())
        my_dict[tag] = ' '.join(token_list)
    return my_dict

In [115]:
# Create nouns and adjective dictionary
nouns_adj_dict = get_nouns_adj(docs, ['sex', 'religion', 'politics'])

In [116]:
# Construct dataframe
nouns_adj_df = pd.DataFrame.from_dict(nouns_adj_dict, orient='index')
nouns_adj_df.rename({0: 'transcript'}, axis=1, inplace=True)
nouns_adj_df.head()

Unnamed: 0,transcript
sex,teen terrible period crippling cramp blood clo...
religion,primordial destroyer evil slayer demon protect...
politics,team researcher math test exam american adult ...


In [117]:
# # Pickle dataframe
# nouns_adj_df.to_pickle('data/processed_transcript_df.pkl')

# # Load dataframe
# nouns_adj_df = pd.read_pickle('data/processed_transcript_df.pkl')
# print(f'Shape: {df.shape}')

## Document-Term Matrix

In [118]:
# Use spaCy stop words
cv = CountVectorizer(stop_words=spacy_stop_words)
data_cv = cv.fit_transform(nouns_adj_df['transcript'])
dtm_df = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
dtm_df.index = df.index
dtm_df

  'stop_words.' % sorted(inconsistent))


Unnamed: 0,abathembu,abaya,abbreviation,abduction,aberration,abhorrent,ability,able,ablebodied,ableism,...,zipper,zombie,zone,zoning,zoo,zucchini,zygote,élan,état,送你葱
sex,0,0,1,1,0,1,20,62,0,0,...,0,0,2,1,0,1,1,1,0,0
religion,0,1,0,0,0,1,16,54,1,0,...,0,1,1,1,0,0,0,0,0,0
politics,1,0,0,0,1,1,80,203,0,1,...,3,5,34,2,5,0,0,0,2,1


In [119]:
# # Pickle document-term matrix
# dtm_df.to_pickle('data/dtm_nouns_adj.pkl')

# Load document-term matrix
dtm_df = pd.read_pickle('data/dtm_nouns_adj.pkl')
print(f'Shape: {dtm_df.shape}')

Shape: (3, 12385)


# Exploratory Data Analysis

* Word Clouds

## Most Common Words

### Analysis

In [140]:
# Read in the document-term matrix
data = pd.read_pickle('data/dtm_nouns_adj.pkl')
data = dtm_df.transpose()
data.head()

Unnamed: 0,abathembu,abaya,abbreviation,abduction,aberration,abhorrent,ability,able,ablebodied,ableism,...,zipper,zombie,zone,zoning,zoo,zucchini,zygote,élan,état,送你葱
sex,0,0,1,1,0,1,20,62,0,0,...,0,0,2,1,0,1,1,1,0,0
religion,0,1,0,0,0,1,16,54,1,0,...,0,1,1,1,0,0,0,0,0,0
politics,1,0,0,0,1,1,80,203,0,1,...,3,5,34,2,5,0,0,0,2,1


In [123]:
# Find the top words said by each tag
n_words = 15
top_dict = {}
for tag in data.columns:
    top = data[tag].sort_values(ascending=False).head(n_words)
    top_dict[tag]= list(zip(top.index, top.values))
    
# Print the top words said by each tag
for tag, top_words in top_dict.items():
    print(tag)
    print(', '.join([word for word, count in top_words[0:n_words]]))
    print('---')

sex
woman, sex, people, man, thing, time, sexual, way, year, girl, lot, child, male, good, life
---
religion
people, world, thing, way, time, life, religion, year, good, human, man, religious, idea, compassion, day
---
politics
people, world, year, thing, country, way, time, power, good, political, government, new, idea, problem, woman
---


## Update stop words

In [124]:
# Look at the most common top words --> add them to the stop word list
from collections import Counter

# Let's first pull out the top words for each tag
words = []
for tag in data.columns:
    top = [word for (word, count) in top_dict[tag]]
    for t in top:
        words.append(t)
        
words

['woman',
 'sex',
 'people',
 'man',
 'thing',
 'time',
 'sexual',
 'way',
 'year',
 'girl',
 'lot',
 'child',
 'male',
 'good',
 'life',
 'people',
 'world',
 'thing',
 'way',
 'time',
 'life',
 'religion',
 'year',
 'good',
 'human',
 'man',
 'religious',
 'idea',
 'compassion',
 'day',
 'people',
 'world',
 'year',
 'thing',
 'country',
 'way',
 'time',
 'power',
 'good',
 'political',
 'government',
 'new',
 'idea',
 'problem',
 'woman']

In [125]:
# Let's aggregate this list and identify the most common words
Counter(words).most_common()

[('people', 3),
 ('thing', 3),
 ('time', 3),
 ('way', 3),
 ('year', 3),
 ('good', 3),
 ('woman', 2),
 ('man', 2),
 ('life', 2),
 ('world', 2),
 ('idea', 2),
 ('sex', 1),
 ('sexual', 1),
 ('girl', 1),
 ('lot', 1),
 ('child', 1),
 ('male', 1),
 ('religion', 1),
 ('human', 1),
 ('religious', 1),
 ('compassion', 1),
 ('day', 1),
 ('country', 1),
 ('power', 1),
 ('political', 1),
 ('government', 1),
 ('new', 1),
 ('problem', 1)]

Additional stop words:
* talk, sex, sexual, religion, religious, political, politics

In [126]:
# If all three tags have the top word, exclude it
add_stop_words = [word for word, count in Counter(words).most_common() if count > 2]
add_stop_words

['people', 'thing', 'time', 'way', 'year', 'good']

In [127]:
with open('data/spacy_stop_words.pkl', 'rb') as f:
    stop_words = list(pickle.load(f))
    f.close()

In [128]:
def update_stop_words(list_to_update, add_stop_words):
    """Add custom stop words to stop word list"""
    stop_words = list_to_update
    for word in add_stop_words:
        stop_words.append(word)
    return stop_words

In [129]:
# Let's update our document-term matrix with the new list of stop words
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer

# Load clean dataframe
nouns_adj_df = pd.read_pickle('data/processed_transcript_df.pkl')

# Add new stop words
stop_words = update_stop_words(stop_words, add_stop_words)

# Recreate document-term matrix
cv = CountVectorizer(stop_words=stop_words)
data_cv = cv.fit_transform(nouns_adj_df['transcript'])
dtm_df = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
dtm_df.index = nouns_adj_df.index

# Pickle it for later use
with open('data/cv_stop.pkl', 'wb') as f:
    pickle.dump(cv, f)
    f.close()

  'stop_words.' % sorted(inconsistent))


## Word clouds

In [130]:
# Load clean df
nouns_adj_df = pd.read_pickle('data/processed_transcript_df.pkl')

In [131]:
from wordcloud import WordCloud

wc = WordCloud(stopwords=stop_words, background_color='white', colormap='Dark2',
               max_font_size=150, random_state=42)

In [137]:
# Reset the output dimensions
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize'] = [16, 6]

tags = ['sex', 'religions', 'politics']

# Create subplots for each comedian
for index, tags in enumerate(data.columns):
    wc.generate(nouns_adj_df.transcript[tag])
    
    plt.subplot(3, 1, index+1)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(tags[index])
    
plt.show()

NameError: name 'data' is not defined

# Topic Modeling

## NMF / LDA
Non-Negative Matrix Factorization (NMF)

Find two non-negative matrices (W, H) whose product approximates the non-
negative matrix X. This factorization can be used for example for
dimensionality reduction, source separation or topic extraction.

* LDA tends to work better on larger documents


In [None]:
no_topics = 10

nmf_model = NMF(n_components=no_topics)
doc_topic = nmf_model.fit_transform(doc_word)

In [None]:
component_ix = []
for i in range(1, no_topics+1):
    component_ix.append('component_' + str(i))
    
topic_word = pd.DataFrame(nmf_model.components_.round(3),
                          index=component_ix,
                          columns=vectorizer.get_feature_names())
topic_word

In [None]:
# Create df for topics
no_top_words = 10
topic_df = pd.DataFrame()
for ix, topic in enumerate(nmf_model.components_):
    col = 'topic_' + str(ix+1)
    topic_df[col] = pd.Series(vectorizer.get_feature_names()[i] for i in topic.argsort()[:-no_top_words - 1:-1])

topic_df

The H matrix shows us the documents we started with, and how each document is made up of the 2 resulting topics. In this case:
- The first document seems to be about 'X'
- The last document seems to be about 'Y'
- Everything in between is a combination of the two

In [None]:
H = pd.DataFrame(doc_topic.round(3),
                 index=df.loc[:20, 'title'],
                 columns=component_ix)
H

# Clustering

# Visualizations
Use umap for visualizations, clustering, and reducing dimensionality

– to scrape all the links look for the hrefs of the homepage
– do cleaning of which ones are not for videos
– 

# spaCy attributes

* Text: The original word text.
* Lemma: The base form of the word.
* POS: The simple part-of-speech tag.
* Tag: The detailed part-of-speech tag.
* Dep: Syntactic dependency, i.e. the relation between tokens.
* Shape: The word shape – capitalization, punctuation, digits.
* is alpha: Is the token an alpha character?
* is stop: Is the token part of a stop list, i.e. the most common words of the language?

In [32]:
# Print token attributes
for doc in docs:
    print('{:<25}{:<25}{:<10}{:<10}{:<15}{:<10}{:<10}{:<10}'.format(
    'Text', 'Lemma', 'POS', 'Tag', 'Dependency', 'Shape', 'Alpha', 'Stop'))
    for token in doc:
        # This is for formatting only
        print('{:<25}{:<25}{:<10}{:<10}{:<15}{:<10}{:<10}{:<10}'.format(
            token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop))
    print('\n')

Text                     Lemma                    POS       Tag       Dependency     Shape     Alpha     Stop      
When                     when                     ADV       WRB       advmod         Xxxx      1         1         
I                        -PRON-                   PRON      PRP       nsubj          X         1         1         
was                      be                       AUX       VBD       ROOT           xxx       1         1         
a                        a                        DET       DT        det            x         1         1         
teen                     teen                     NOUN      NN        attr           xxxx      1         0         
I                        -PRON-                   PRON      PRP       nsubj          X         1         1         
had                      have                     AUX       VBD       relcl          xxx       1         1         
terrible                 terrible                 ADJ       JJ        am

KeyboardInterrupt: 

In [None]:
# Iterate through each doc in the corpus
for ix, doc in enumerate(docs):
    # Print each doc (transcript)
    print(f'\nDoc: {ix}\n{doc}\n')
    
    # Iterate through the predicted entities
    # Print the entity text and its label
    print(
        tabulate([(ent.text, ent.label_) for ent in doc.ents],
                 headers=['Entity', 'Label'],
                 tablefmt='github'))
    
    # Extract labels from entities and store
    labels = [ent.label_ for ent in doc.ents]
    
    # Iterate through the predicted labels
    # Print the label and its count
    print('\n')
    print(
        tabulate([(count, label) for label, count in Counter(labels).items()],
                 headers=['Count', 'Label'],
                 tablefmt='github'))

# Stanford NLP + SpaCy

In [None]:
import stanfordnlp
from spacy_stanfordnlp import StanfordNLPLanguage

snlp = stanfordnlp.Pipeline(lang="en")
nlp = StanfordNLPLanguage(snlp)

doc = nlp("Barack Obama was born in Hawaii. He was elected president in 2008.")

In [None]:
print('{:<25}{:<25}{:<10}{:<10}{:<15}{:<10}{:<10}{:<10}'.format(
'Text', 'Lemma', 'POS', 'Tag', 'Dependency', 'Shape', 'Alpha', 'Stop'), '\n')

for token in doc:
    # This is for formatting only
    print('{:<25}{:<25}{:<10}{:<10}{:<15}{:<10}{:<10}{:<10}'.format(
        token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
        token.shape_, token.is_alpha, token.is_stop))
print('\n')

In [None]:
from spacy import displacy
displacy.render(doc)