In [None]:
import sys
import PyPDF2
import pandas as pd
import spacy
from tabula.io import read_pdf
import re

1. look for Tesla's high freq. terms and check the difference compared to NIO、XPeng、Li

In [71]:
nlp = spacy.load('en_core_web_sm')
def show_ner_with_page(text: str):
    '''
    This function requires nlp preloaded
    :param text: the text
    :return: returns a display dataframe with all the entity included
    '''
    display_df = pd.DataFrame()
    if nlp(text).ents:
        for ent in nlp(text).ents:
            val = [ent.text, ent.start_char, ent.end_char, ent.label_]
            df = pd.DataFrame(val).transpose()
            display_df = display_df.append(df, ignore_index=True)

    display_df.columns = ['entities', 'start_char', 'end_char', 'label']
    return display_df

In [62]:
def testGetter(file: str, cut: int):
    '''
    financial report is too big, we can use a few pages for the test.
    for example, if cut is 100, then we only get the first 1% pages of the file.
    :param file: input the file name
    :return: the output is a part of text file
    '''
    f = open(file, 'r')
    text = f.read()
    leng = int(len(text) / cut)

    return text[:leng].replace('\n', '')
# testGetter('Li_AnnualReport_2020.txt', 100)

In [81]:
# this line shows the NER
show_ner_with_page(testGetter('Report/Li_AnnualReport_2020.txt', cut=100))



Unnamed: 0,entities,start_char,end_char,label
0,D.C.,76,80,GPE
1,20,91,93,CARDINAL
2,REGISTRATION STATEMENT PURSUANT,107,138,PERSON
3,1934For the fiscal year ended December 31,286,327,DATE
4,1934For,421,428,CARDINAL
...,...,...,...,...
140,H.10,9600,9604,GPE
141,The Board of Governors,9628,9650,ORG
142,the Federal Reserve System,9654,9680,ORG
143,U.S.,9774,9778,GPE


In [137]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_lg')
nlp.max_length = 2000000
class TextRank4Keyword():
    """Extract keywords from text"""

    def __init__(self):
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight


    def set_stopwords(self, stopwords):
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True

    def sentence_segment(self, doc, candidate_pos, lower):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store words only with cadidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences

    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab

    def get_token_pairs(self, window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i+1, i+window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs

    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())

    def get_matrix(self, vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1

        # Get Symmeric matrix
        g = self.symmetrize(g)

        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm

        return g_norm


    def get_keywords(self, number=10):
        """Print top number keywords"""
        node_weight = OrderedDict(sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True))
        res = []
        for i, (key, value) in enumerate(node_weight.items()):
            # print(key + ' - ' + str(value))
            res.append([key, str(value)])
            if i > number:
                break

        return res


    def analyze(self, text,
                candidate_pos=['NOUN', 'PROPN'],
                window_size=4, lower=False, stopwords=list()):
        """Main function to analyze text"""

        # Set stop words
        self.set_stopwords(stopwords)

        # Pare text by spaCy
        doc = nlp(text)

        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower) # list of list of words

        # Build vocabulary
        vocab = self.get_vocab(sentences)

        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)

        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)

        # Initionlization for weight(pagerank value)
        pr = np.array([1] * len(vocab))

        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr))  < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]

        self.node_weight = node_weight

In [127]:
def keywordGetter(filepath: str, file_cut: int,  candidate_pos=['NOUN', 'PROPN'], numOfKeywords=50):
    if file_cut == 0:
        # we'll use the whole file
        f = open(filepath, 'r')
        text4Rank = f.read()
        f.close()
        tr4w = TextRank4Keyword()
        tr4w.analyze(text4Rank, candidate_pos=candidate_pos, window_size=4, lower=False)
    else:
        # test mode
        text4Rank = testGetter(filepath, file_cut)
        tr4w = TextRank4Keyword()
        tr4w.analyze(text4Rank, candidate_pos=candidate_pos, window_size=4, lower=False)

    res = tr4w.get_keywords(numOfKeywords)

    return res

In [156]:
# this cell will take about 10 min to run
keyword_Tesla = keywordGetter('Report/Tesla_AnnualReport_2020.txt', 0)
keyword_Li = keywordGetter('Report/Li_AnnualReport_2020.txt', 0)
keyword_NIO = keywordGetter('Report/NIO_AnnualReport_2020.txt', 0)
keyword_XPeng = keywordGetter('Report/XPeng_AnnualReport_2020.txt', 0)

In [152]:
def string_compare(firmA, firmB):

    shared = []
    keywordA = [x[0] for x in firmA]
    keywordB = [x[0] for x in firmB]
    for word in keywordA:
        if word in keywordB:
            shared.append(word)

    temp = set(keywordA) | set(keywordB)
    difference = list(temp - set(shared))

    return [shared, difference]

In [160]:
string_compare(keyword_Tesla, keyword_Li)[1]

['year',
 'energy',
 'securities',
 'China',
 'lease',
 'period',
 'stock',
 'Model',
 'Solar',
 'expenses',
 'shareholders',
 'products',
 'service',
 'tsla-20211231.htm',
 'Shares',
 'Contents',
 'Table',
 'requirements',
 'edgar',
 'share',
 'rights',
 'cost',
 'Li',
 'Gigafactory',
 'system',
 'Company',
 'production',
 'revenue',
 'storage',
 'equity',
 'systems',
 'costs',
 'years',
 'PRC',
 'cash',
 'January',
 'Group',
 'loss',
 'management',
 'debt',
 'Party',
 'shares',
 'date',
 'company',
 'operating',
 'regulations',
 'Tesla',
 'Ñ',
 'development',
 'users',
 'laws',
 'Class',
 'ADSs',
 'price',
 'Agreement',
 'battery',
 'manufacturing',
 'customers',
 'liabilities',
 'Beijing',
 'Archives',
 'respect',
 'February',
 'statements']

In [179]:
with open('string_compare.txt', 'w') as f:
    for firm in ['Li', 'NIO', 'XPeng']:
        f.write(firm + '\n')
        f.write('  ' + 'shared' + '\n')

        if firm == 'Li':
            temp = string_compare(keyword_Tesla, keyword_Li)
            f.write(' '.join(x for x in temp[0]))
        if firm == 'NIO':
            temp = string_compare(keyword_Tesla, keyword_NIO)
            f.write(' '.join(x for x in temp[0]))
        if firm == 'XPeng':
            temp = string_compare(keyword_Tesla, keyword_XPeng)
            f.write(' '.join(x for x in temp[0]))

        f.write('\n  ' + 'difference' + '\n')

        if firm == 'Li':
            temp = string_compare(keyword_Tesla, keyword_Li)
            f.write(' '.join(x for x in temp[1]))
        if firm == 'NIO':
            temp = string_compare(keyword_Tesla, keyword_NIO)
            f.write(' '.join(x for x in temp[1]))
        if firm == 'XPeng':
            temp = string_compare(keyword_Tesla, keyword_XPeng)
            f.write(' '.join(x for x in temp[1]))

        f.write('\n\n\n')
    f.close()

2. Relation graph for these four new energy car manufacturers.
The graphs are stored in html file in folder `news_graph`.

In [186]:
import os
sys.path.insert(0, '/Users/feiteng/Documents/MGTF_venv/FinalProj/news_graph')
from news_graph.news_graph import NewsMining

In [202]:
# test
content = testGetter('Report/Li_AnnualReport_2020.txt', cut=100)
# whole text
# f = open('Report/Li_AnnualReport_2020.txt', 'r')
# content = f.read()
# f.close()
Miner = NewsMining()
Miner.main(content)
