In [1]:
#%pip install gutenberg_cleaner
from urllib.request import urlopen 
from gutenberg_cleaner import simple_cleaner

In [None]:
# Download the plain-text "War and Peace" file from Project Gutenberg.
# The context manager closes the HTTP response once the body has been read.
with urlopen('https://www.gutenberg.org/files/2600/2600-0.txt') as response:
    wnp = response.read().decode('utf-8')
# simple_cleaner comes from gutenberg_cleaner; presumably it strips the
# Project Gutenberg boilerplate — confirm against that package's docs.
wnp = simple_cleaner(wnp)
# Preview the first 100 characters.
wnp[:100]

In [None]:
# Lower-case the whole text so that later per-word membership tests
# (e.g. the stop-word lookup) are effectively case-insensitive.
wnp = wnp.lower()

In [None]:
import re 
# Collapse every run of whitespace into a single space and trim both ends.
wnp = re.sub(r'\s+', ' ', wnp).strip()

In [None]:
import string
# Remove e-mail-like tokens BEFORE deleting punctuation: once '@' and '.' are
# gone the address pattern can no longer match and the address fragments
# would stay in the text as ordinary words.
wnp = re.sub(r'\w+@\w+\.\w+', '', wnp)
# Delete every ASCII punctuation character listed in string.punctuation.
wnp = wnp.translate(str.maketrans('', '', string.punctuation))

In [None]:
# Remove e-mail-like tokens of the form word@word.word.
# NOTE(review): in notebook order this runs after the cell that deletes all
# string.punctuation characters, so '@' and '.' are already gone and this
# pattern cannot match anything here — confirm the intended cell order.
wnp = re.sub(r'\w+@\w+\.\w+', '', wnp)

In [None]:
# Import the stop-word list from its public location; `_stop_words` is a
# private scikit-learn module (leading underscore) and may move between releases.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Keep only the whitespace-separated tokens that are not English stop words,
# then rebuild a single space-joined string.
non_stopwords = [word for word in wnp.split() if word not in ENGLISH_STOP_WORDS]
cleaned_text = ' '.join(non_stopwords)

In [None]:
import nltk 
# Download NLTK's 'stopwords' resource (needed by the stop-word lookup in the next cell).
nltk.download('stopwords')

In [None]:
# Display NLTK's English stop-word list (requires the 'stopwords' resource
# downloaded by the previous cell).
nltk.corpus.stopwords.words('english')

In [None]:
from nltk.stem import SnowballStemmer

# Stem every token of the stop-word-filtered text with the English Snowball
# stemmer and join the stems back into one space-separated string.
stemmer = SnowballStemmer('english')
stemmed_words = [stemmer.stem(token) for token in cleaned_text.split()]
stemmed_text = ' '.join(stemmed_words)

In [None]:
#%pip install spacy

In [None]:
import spacy 
# Load the small English pipeline with the 'parser' and 'ner' components disabled.
spacy_en_model = spacy.load('en_core_web_sm', disable=['parser', 'ner']) 
# Raise the pipeline's max_length to 4,000,000 so a very long text can be
# passed in a single call (presumably the whole book — confirm its size).
spacy_en_model.max_length = 4000000 
def clean_text_spacy(text):
    """Lemmatize ``text`` and return the lemmas as one lower-cased string.

    The text is run through the module-level ``spacy_en_model`` (looked up at
    call time, so whichever pipeline is bound to that name when the function
    runs is the one used). Tokens that are not purely alphabetic, or that are
    flagged as stop words, are dropped. A token whose lemma is the placeholder
    ``'-PRON-'`` contributes its lower-cased surface form instead of the lemma.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        Space-separated, lower-cased lemmas.
    """
    kept = []
    for token in spacy_en_model(text):
        # Skip punctuation/numbers/mixed tokens and stop words.
        if not token.is_alpha or token.is_stop:
            continue
        if token.lemma_ == '-PRON-':
            kept.append(token.lower_)
        else:
            kept.append(token.lemma_)
    return ' '.join(kept).lower()
# `wnp` was lower-cased and stripped of punctuation by the earlier cells, so
# the original text is fetched again before lemmatizing.
# The context manager closes the HTTP response once the body has been read.
with urlopen('https://www.gutenberg.org/files/2600/2600-0.txt') as response:
    wnp = response.read().decode('utf-8')
wnp = simple_cleaner(wnp)
lemmatized_text = clean_text_spacy(wnp)

In [None]:
#!python -m spacy download en_core_web_sm

In [None]:
#!python -m spacy download en_core_web_lg

In [None]:
# Load the small English pipeline with nothing disabled and list its component names.
# NOTE(review): this rebinds `spacy_en_model`, the global that
# clean_text_spacy() reads, to a pipeline whose max_length has not been
# raised — re-running the book lemmatization after this cell may fail; confirm.
spacy_en_model = spacy.load('en_core_web_sm') 
spacy_en_model.pipe_names 

In [None]:
import spacy 
# Switch to the large English pipeline (parser and NER disabled) and raise
# max_length as before. This rebinds `spacy_en_model`, so later calls to
# clean_text_spacy() use this pipeline.
spacy_en_model = spacy.load('en_core_web_lg', disable=['parser', 'ner']) 
spacy_en_model.max_length = 4000000 
# Process the cleaned book text once; the resulting Doc is reused by later cells.
processed_text = spacy_en_model(wnp)

In [None]:
# Print the text and the vector of each of the first 10 tokens.
for word in processed_text[:10]: 
    print(word.text, word.vector)

In [None]:
from sklearn.datasets import fetch_20newsgroups 
# Fetch the 20 Newsgroups data with headers and footers removed from each post.
# No `subset` is passed, so scikit-learn's default subset is used
# (presumably the training split — confirm).
newsgroups_train = fetch_20newsgroups(remove=('headers', 'footers'))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer 
# TF-IDF matrix of the raw posts using the vectorizer's default settings.
# Note: `tfidf_vectorizer` and `ng_train_tfidf` are rebound by a later cell.
tfidf_vectorizer = TfidfVectorizer() 
ng_train_tfidf = tfidf_vectorizer.fit_transform(newsgroups_train['data'])

In [None]:
import pandas as pd
import swifter  # imported for its side effect: provides the `.swifter` accessor used below

# One row per post: the text and its integer newsgroup label.
ng_train_df = pd.DataFrame({'text': newsgroups_train['data'],
                            'label': newsgroups_train['target']})
# Replace every post with its lemmatized, stop-word-free form.
ng_train_df['text'] = ng_train_df['text'].swifter.apply(clean_text_spacy)
# min_df=10 drops terms found in fewer than 10 posts; max_df=0.9 drops terms
# found in more than 90% of posts.
tfidf_vectorizer = TfidfVectorizer(min_df=10, max_df=0.9)
# The argument list must sit on the same line as `fit_transform`: with a line
# break in between, the bound method itself is assigned and the vectorizer is
# never fitted (no `vocabulary_`, and `ng_train_tfidf` is not a matrix).
ng_train_tfidf = tfidf_vectorizer.fit_transform(ng_train_df['text'])

In [None]:
from pycaret.nlp import setup, plot_model 
# Initialise PyCaret's NLP module on the raw posts, adding four extra stop words.
nlp_setup = setup(newsgroups_train['data'], custom_stopwords= ['ax', 'edu', 'com', 'write']) 
# Request the 'frequency' plot with no model supplied.
plot_model(model=None, plot='frequency')

In [None]:
from nltk import FreqDist 
# Frequency distribution over the lemmatized book's tokens; plot the first 20 entries.
fd = FreqDist(lemmatized_text.split()) 
fd.plot(20)

In [None]:
from nltk import bigrams

# Turn each adjacent token pair into a "first second" string and count them.
bigram_phrases = (' '.join(pair) for pair in bigrams(lemmatized_text.split()))
fd_bg = FreqDist(bigram_phrases)
fd_bg.plot(20)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

num_words = 20

# vocabulary_ maps term -> column index; invert it to column index -> term.
idx_to_word = dict(zip(tfidf_vectorizer.vocabulary_.values(),
                       tfidf_vectorizer.vocabulary_.keys()))

# Total TF-IDF weight per term (column sums), ordered from largest to smallest.
tfidf_sum = np.asarray(ng_train_tfidf.sum(axis=0)).flatten()
sorted_idx = tfidf_sum.argsort()[::-1]
tfidf_sum = tfidf_sum[sorted_idx]

# Bar chart of the `num_words` heaviest terms, labelled with the terms themselves.
top_words = [idx_to_word[column] for column in sorted_idx[:num_words]]
xticks = range(num_words)
plt.bar(xticks, tfidf_sum[:num_words])
plt.xticks(xticks, top_words, rotation=90)
plt.xlabel('word')
plt.ylabel('TFIDF')

In [None]:
import seaborn as sns

# Histogram of post lengths (characters per cleaned post) on a log-scaled x axis.
post_lengths = ng_train_df['text'].apply(len)
sns.histplot(post_lengths)
plt.xscale('log')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer 
def get_top_grams(docs, n=2):
    """Count the n-grams in ``docs`` and rank them from most to least frequent.

    Parameters
    ----------
    docs : iterable of str
        Documents handed to ``CountVectorizer.fit_transform``.
    n : int, default 2
        N-gram size; ``ngram_range=(n, n)`` means only n-grams of exactly
        this length are counted.

    Returns
    -------
    tuple
        ``(grams, counts)``: a list of n-gram strings and an array with their
        total counts over all documents, both in descending count order.
    """
    vectorizer = CountVectorizer(ngram_range=(n, n))
    count_matrix = vectorizer.fit_transform(docs)
    # Sum over documents to get one total per n-gram column.
    totals = np.array(np.sum(count_matrix, axis=0)).flatten()
    # vocabulary_ maps n-gram -> column index; invert it for the lookup below.
    column_to_gram = {column: gram for gram, column in vectorizer.vocabulary_.items()}
    order = totals.argsort()[::-1]
    ranked_grams = [column_to_gram[column] for column in order]
    return ranked_grams, totals[order]

In [None]:
# Ranked n-grams and their counts for the lemmatized book, keyed by n-gram size.
ngrams, ngram_counts = {}, {}
for n in [1, 2, 3]:
    # The call must stay on the same line as the `=`: a bare line break after
    # the assignment operator is a SyntaxError.
    ngrams[n], ngram_counts[n] = get_top_grams([lemmatized_text], n=n)

In [None]:
from scipy.stats import zipf 
def make_zipf_plot(counts, tokens, a=1.15):
    """Plot observed token frequencies against rank on log-log axes with a Zipf overlay.

    Parameters
    ----------
    counts : array-like of numbers
        Occurrence count of each token.
    tokens : sequence of str
        Token strings, parallel to ``counts`` (``tokens[i]`` has ``counts[i]``).
    a : float, default 1.15
        Shape parameter passed to ``scipy.stats.zipf.pmf`` for the overlay curve.

    Returns
    -------
    None
        The figure is drawn and shown with ``plt.show()``.
    """
    # Accept plain lists as well as arrays (negation and fancy indexing below need an array).
    counts = np.asarray(counts)
    ranks = np.arange(1, len(counts) + 1)
    indices = np.argsort(-counts)  # positions of counts, largest first
    normalized_frequencies = counts[indices] / counts.sum()

    plt.figure(figsize=(5.5, 5.5))
    plt.loglog(ranks, normalized_frequencies, marker=".")
    # Theoretical Zipf probabilities at the same ranks.
    plt.loglog(ranks, zipf.pmf(ranks, a))
    plt.title("Zipf Plot")
    plt.xlabel("Word frequency rank")
    plt.ylabel("Word frequency")
    ax = plt.gca()
    ax.set_aspect('equal')  # make the plot square
    plt.grid(True)

    # Label about ten points spread evenly in log-rank. Truncating the
    # log-spaced values to int repeats the smallest indices, so duplicates are
    # removed to avoid drawing the same label more than once.
    if len(counts) > 1:
        labeled_word_idxs = np.unique(
            np.logspace(-0.5, np.log10(len(counts) - 1), 10).astype(int)
        )
    else:
        # log10(len(counts) - 1) is undefined for zero or one token.
        labeled_word_idxs = np.arange(len(counts))
    for i in labeled_word_idxs:
        plt.text(ranks[i],
                 normalized_frequencies[i],
                 " " + tokens[indices[i]],
                 verticalalignment="bottom",
                 horizontalalignment="left")
    plt.show()

In [None]:
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder 
# The 10 best bigrams of the lemmatized book according to the PMI association measure.
BigramCollocationFinder.from_words(lemmatized_text.split()).nbest(BigramAssocMeasures().pmi, 10)

In [None]:
# Score bigrams with PMI and display the result (bigram together with its score).
# NOTE(review): no slice is applied, so the whole scored list is rendered as
# cell output — this may be very long; consider showing only the head.
BigramCollocationFinder.from_words(lemmatized_text.split()).score_ngrams(BigramAssocMeasures().pmi)

In [None]:
# Alphabetic, non-stop-word tokens of the processed book.
content_tokens = [token for token in processed_text
                  if token.is_alpha and not token.is_stop]
# Column-oriented record of each kept token's lower-cased text and its POS tag.
pos_dict = {
    'word': [token.lower_ for token in content_tokens],
    'POS': [token.pos_ for token in content_tokens],
}
wnp_pos_df = pd.DataFrame(pos_dict)

In [None]:
# Number of tokens per POS tag, 10 most common tags.
# Only 'POS' and 'word' are selected so the result always has exactly one
# column: the helper 'count' column added below would otherwise turn this into
# a two-column frame on a second run and break the rename.
pos_counts = (wnp_pos_df[['POS', 'word']]
              .groupby('POS').count()
              .sort_values(by='word', ascending=False)
              .head(10))
pos_counts.columns = ['count']
# Helper column of ones so that counting per (POS, word) yields a 'count' column.
wnp_pos_df['count'] = 1
# 10 most frequent (POS, word) pairs.
wnp_pos_df.groupby(['POS', 'word']).count().sort_values(by='count', ascending=False).head(10)

In [None]:
# Show each newsgroup name next to its integer label.
[(name, label) for label, name in enumerate(newsgroups_train['target_names'])]

In [None]:
# Rows whose label is 14 (presumably sci.space, per the listing in the previous cell — confirm).
is_space_post = ng_train_df['label'] == 14
# Independent copy with a fresh 0..n-1 index; the old index is kept as a column.
space_ng = ng_train_df[is_space_post].copy().reset_index()

In [None]:
from pycaret.nlp import setup, create_model, plot_model, assign_model 
# Re-initialise PyCaret NLP on the label-14 subset, with 'text' as the target column.
space_setup = setup(space_ng, target='text')

In [None]:
# Create an LDA topic model (no num_topics given, so PyCaret's default applies)
# and draw the 'topic_model' plot for it.
lda = create_model('lda') 
plot_model(lda, 'topic_model')

In [None]:
# Result of assign_model for the LDA model; its 'text' column is read by the
# coherence cells that follow.
lda_results = assign_model(lda)

In [None]:
from gensim.models import CoherenceModel 
# Coherence of the LDA model over the whitespace-tokenised texts, using the
# model's own id2word dictionary. No `coherence=` argument is passed, so
# gensim's default measure is used.
cm = CoherenceModel(model=lda, 
texts=lda_results['text'].map(str.split).tolist(), dictionary=lda.id2word) 
cm.get_coherence()

In [None]:
# Coherence score for LDA models with 2 to 15 topics, in that order.
# NOTE(review): the loop rebinds the globals `lda`, `lda_results` and `cm`;
# after this cell they refer to the last (15-topic) model.
coherences = [] 
for num_topics in range(2, 16): 
    lda = create_model('lda', num_topics=num_topics) 
    lda_results = assign_model(lda) 
    cm = CoherenceModel(model=lda,  
texts=lda_results['text'].map(str.split).tolist(),  
                        dictionary=lda.id2word) 
    coherences.append(cm.get_coherence())

In [None]:
# Coherence versus topic count; range(2, 16) must match the range used to
# fill `coherences` in the previous cell.
plt.plot(range(2, 16), coherences) 
plt.xlabel('number of LDA topics') 
plt.ylabel('coherence score')

In [None]:
from top2vec import Top2Vec 
# Frame of the raw (not lemmatized) posts, restricted to label 14.
raw_ng_df = pd.DataFrame({'text': newsgroups_train['data'],  'label': newsgroups_train['target']}) 
raw_space_df = raw_ng_df[raw_ng_df['label'] == 14] 
# Build a Top2Vec model on those posts with workers=8
# (hard-coded — adjust to the machine's core count).
model = Top2Vec(documents=raw_space_df['text'].to_list(), 
workers=8)

In [None]:
# Fetch both splits with headers and footers removed: the first call uses the
# default subset, the second explicitly requests subset='test'.
newsgroups_train = fetch_20newsgroups(remove=('headers', 
'footers')) 
newsgroups_test = fetch_20newsgroups(subset='test',  
                                     remove=('headers', 
'footers')) 
# Separate handle on the large English pipeline (parser and NER disabled),
# used by get_document_vectors().
en_large = spacy.load('en_core_web_lg', disable=['parser', 
'ner']) 
def get_document_vectors(text):
    """Return the ``.vector`` of the spaCy Doc obtained by running ``en_large`` on ``text``.

    ``en_large`` is a module-level pipeline looked up at call time.
    """
    return en_large(text).vector
def _frame_and_doc_vectors(dataset):
    """Return (text/label DataFrame, DataFrame of document vectors) for one fetched split."""
    frame = pd.DataFrame({'text': dataset['data'],
                          'label': dataset['target']})
    # One vector per post, stacked row-wise into a 2-D array.
    vectors = pd.DataFrame(
        np.vstack(frame['text'].swifter.apply(get_document_vectors).tolist())
    )
    return frame, vectors


# The train and test splits go through exactly the same steps, so both use the
# helper above instead of two copies of the code.
# Note: `ng_train_df` is rebound here and now holds the raw post text.
ng_train_df, ng_train_doc_vectors = _frame_and_doc_vectors(newsgroups_train)
ng_test_df, ng_test_doc_vectors = _frame_and_doc_vectors(newsgroups_test)

In [None]:
# Put the label (as a categorical column) side by side with the document
# vectors, column-wise, for each split.
ng_train_vector_df = pd.concat([ng_train_df['label'].astype('category'), 
                                ng_train_doc_vectors], axis=1) 
ng_test_vector_df = pd.concat([ng_test_df['label'].astype('category'), 
                               ng_test_doc_vectors], axis=1)

In [None]:
vectorizer = TfidfVectorizer(min_df=10, max_df=0.9)
# Fit on the training posts only; the test posts are transformed with the
# already-fitted vectorizer.
train_tfidf = vectorizer.fit_transform(newsgroups_train['data'])
test_tfidf = vectorizer.transform(newsgroups_test['data'])
# toarray() returns a plain ndarray, whereas todense() returns the legacy
# np.matrix type. Either way the full dense matrix is materialised in memory.
train_tfidf_df = pd.DataFrame(train_tfidf.toarray())
test_tfidf_df = pd.DataFrame(test_tfidf.toarray())
# Attach the integer labels as a categorical 'label' column.
train_tfidf_df['label'] = pd.Series(newsgroups_train['target']).astype('category')
test_tfidf_df['label'] = pd.Series(newsgroups_test['target']).astype('category')

In [None]:
from sklearn.linear_model import LogisticRegression

# Features are every column except 'label'; the target is the 'label' column.
doc_vector_features = ng_train_vector_df.drop('label', axis=1)
doc_vector_labels = ng_train_vector_df['label']

lr = LogisticRegression()
lr.fit(doc_vector_features, doc_vector_labels)
# Score on the same rows the model was fitted on (i.e. a training-set score).
# NOTE(review): ng_test_vector_df is built earlier but not evaluated here — confirm.
lr.score(doc_vector_features, doc_vector_labels)

In [None]:
# Same model, this time on the TF-IDF features: every column except 'label'.
tfidf_features = train_tfidf_df.drop('label', axis=1)
tfidf_labels = train_tfidf_df['label']

lr = LogisticRegression()
lr.fit(tfidf_features, tfidf_labels)
# Score on the same rows the model was fitted on (i.e. a training-set score).
lr.score(tfidf_features, tfidf_labels)

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

vader = SentimentIntensityAnalyzer()


def get_sentiment(text):
    """Return the 'compound' entry of VADER's polarity scores for the lower-cased ``text``."""
    scores = vader.polarity_scores(text.lower())
    return scores['compound']


# One sentiment score per post, stored alongside the text.
ng_train_df['sentiment_score'] = ng_train_df['text'].swifter.apply(get_sentiment)

In [None]:
# Map each integer label to its newsgroup name.
label_dict = dict(enumerate(newsgroups_train['target_names']))
# Assign the result back to the column. Calling replace(..., inplace=True) on
# `ng_train_df['label']` operates on a temporary Series and leaves the frame
# unchanged under pandas' Copy-on-Write behaviour.
# Re-running is safe: once the labels are names, no integer keys are left to replace.
ng_train_df['label'] = ng_train_df['label'].replace(label_dict)

In [None]:
# Mean sentiment per newsgroup, most positive first.
# Only the numeric 'sentiment_score' column is aggregated: taking the mean of
# the whole frame would include the string 'text' column, which raises a
# TypeError on pandas >= 2.0.
(ng_train_df.groupby('label')[['sentiment_score']]
 .mean()
 .sort_values(by='sentiment_score', ascending=False))

In [None]:
# Posts from talk.politics.guns with a sentiment score below -0.5.
negative_gun_posts = ng_train_df[
    (ng_train_df['label'] == 'talk.politics.guns')
    & (ng_train_df['sentiment_score'] < -0.5)
]
# Fixed random_state so the same three example posts are shown on every run.
negative_gun_posts.sample(3, random_state=42)['text'].tolist()

In [None]:
import seaborn as sns

# Overlaid sentiment-score histograms for the two newsgroups, coloured by label.
compared_groups = ['talk.politics.guns', 'rec.sport.hockey']
guns_hockey_df = ng_train_df[ng_train_df['label'].isin(compared_groups)]
sns.histplot(data=guns_hockey_df, x='sentiment_score', hue='label')