# Word_Counting.py
import itertools
from typing import List

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import spacy
import textcleaner as tc
from gensim.corpora import Dictionary
from nltk.tokenize import word_tokenize
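# First-run setup (one-time downloads; uncomment if the data is missing):
# nltk.download('punkt')                       # needed by word_tokenize
# nltk.download('averaged_perceptron_tagger')  # needed by nltk.pos_tag
# The spaCy model is installed with: python -m spacy download en_core_web_sm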
nlp = spacy.load('en_core_web_sm')
def sentence_tokenizer(sentences: str) -> List[str]:
    """Split a block of text into sentences using spaCy's sentence boundaries."""
    raw_sentences = []
    doc = nlp(sentences)
    for i, sent in enumerate(doc.sents):
        print('-->Sentence %d: %s' % (i, sent.text))
        raw_sentences.append(sent.text)
    return raw_sentences
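# Illustrative example (made-up ticket text):
# sentence_tokenizer("Printer is down. Please restart the spooler.")
# -> ['Printer is down.', 'Please restart the spooler.']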
def text_preprocessing(sentences: List[str]) -> List[str]:
    """Clean the raw texts with textcleaner (numbers, stopwords, symbols, case), then lemmatize."""
    input_text = list(
        tc.document(sentences).remove_numbers().remove_stpwrds().remove_symbols().lower_all())
    return new_lemmatization(sentences=input_text)
# Earlier spaCy-only version, kept for reference:
# def lemmatization(sentences: List[str], allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
#     """https://spacy.io/api/annotation"""
#     texts_out = []
#     for sent in sentences:
#         doc = nlp(sent.lower())
#         texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else ''
#                                    for token in doc if token.pos_ in allowed_postags]))
#     return texts_out
def new_lemmatization(sentences: List[str],
                      allowed_postags=['NN', 'NNS', 'NNP', 'NNPS', 'RB', 'RBR', 'RBS',
                                       'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
                                       'JJ', 'JJR', 'JJS']):
    """Keep only nouns, adverbs, verbs, and adjectives (Penn Treebank tags), then lemmatize."""
    texts_out = []
    for sent in sentences:
        tokens = word_tokenize(sent)
        tagged = nltk.pos_tag(tokens)
        # Drop tokens whose POS tag is not in the allowed list
        pos_cleaned_sent = " ".join([token for (token, pos) in tagged if pos in allowed_postags])
        doc = nlp(pos_cleaned_sent)
        # Extract the lemma for each token and join; spaCy 2.x lemmatizes pronouns to '-PRON-'
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else ''
                                   for token in doc]))
    return texts_out
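# Illustrative example:
# new_lemmatization(["the printers were failing repeatedly"])
# -> ['printer be fail repeatedly']   ('the' is dropped by the POS filter)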
def remove_null_sentence(sentences: List[str]) -> List[str]:
    """Drop empty strings left over by preprocessing."""
    return [x for x in sentences if x != '']
def word_tokenizer(sentences: List[str]) -> List[List[str]]:
    """Tokenize each non-empty sentence into words; returns a list of lists of tokens."""
    sentence = []
    for raw_sentence in sentences:
        # If a sentence is empty, skip it; otherwise keep its list of words
        if len(raw_sentence) > 0:
            sentence.append(word_tokenize(raw_sentence))
    return sentence
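# Illustrative example:
# word_tokenizer(['printer be fail', 'network be slow'])
# -> [['printer', 'be', 'fail'], ['network', 'be', 'slow']]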
def word_count_per_document(input_sentences: List[List[str]]) -> pd.DataFrame:
    """Map each token to its document frequency (the number of tickets it appears in)."""
    dictionary = Dictionary(input_sentences)
    return pd.DataFrame.from_dict(
        {dictionary[token_id]: dictionary.dfs[token_id] for token_id in dictionary.dfs},
        orient='index')
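# Illustrative example: dictionary.dfs counts documents, not total occurrences.
# word_count_per_document([['printer', 'down'], ['printer', 'slow']])
# -> DataFrame indexed by token: printer -> 2, down -> 1, slow -> 1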
# Load the tickets, flatten the rows into one list of texts, preprocess,
# tokenize, and plot the per-token document frequencies.
df = pd.read_excel('Tickets.xlsx').values.tolist()
sentence = list(itertools.chain.from_iterable(df))
preprocessed_sentences = remove_null_sentence(text_preprocessing(sentences=sentence))
input_sentence = word_tokenizer(preprocessed_sentences)
result = word_count_per_document(input_sentence)
result.plot.line(figsize=(8, 10), style='.-')
plt.show()  # required to display the figure when run as a plain script
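# Note: the DataFrame is ordered by gensim's token ids; sorting first, e.g.
# result.sort_values(0, ascending=False).head(30).plot.bar(), makes the most
# frequent terms easier to read than a line plot over every token.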