In [21]:
import pandas as pd
import numpy as np
import sys
import string

In [5]:
import gensim
import re 
import networkx as nx

import nltk
from nltk.corpus import stopwords
# English stop-word list from NLTK; remove_stopwords() filters tokens against it.
stop_words = stopwords.words('english')

In [165]:
def read_txt_file(filepath):
    """Read a UTF-8 encoded text file and return its entire contents.

    Parameters
    ----------
    filepath : str or path-like
        Location of the text file to read.

    Returns
    -------
    str
        The full text of the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file is not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises,
    # which the manual open()/close() pair did not guarantee.
    with open(filepath, 'r', encoding='utf8') as f:
        return f.read()

def txt_to_sentence(txt):
    """Split raw text into sentences and lower-case each of them.

    Sentence splitting is delegated to gensim's ``split_sentences``.

    Parameters
    ----------
    txt : str
        Raw document text.

    Returns
    -------
    list of str
        Lower-cased sentences, in the order gensim yields them.
    """
    split_result = gensim.summarization.textcleaner.split_sentences(txt)
    return [raw_sentence.lower() for raw_sentence in split_result]

def remove_stopwords(sen):
    """Strip non-alphanumeric characters and stop words from each sentence.

    Parameters
    ----------
    sen : iterable of str
        Sentences to clean.

    Returns
    -------
    list of str
        One cleaned, space-joined string per input sentence. A sentence
        whose tokens are all removed becomes the empty string.
    """
    cleaned_sentences = []
    for sentence in sen:
        # Drop every character outside [A-Za-z0-9] from each whitespace token.
        stripped_tokens = (re.sub('[^A-Za-z0-9]+', '', token) for token in sentence.split())
        # Tokens that were pure punctuation are now empty and are discarded
        # together with the stop words.
        kept_tokens = [
            token for token in stripped_tokens
            if token and token not in stop_words
        ]
        cleaned_sentences.append(" ".join(kept_tokens))
    return cleaned_sentences
    
def cosine_similarity(sent_1, sent_2):
    """Return the cosine similarity of two 1-D vectors.

    Parameters
    ----------
    sent_1, sent_2 : array-like
        Vectors of equal length.

    Returns
    -------
    float
        ``dot(sent_1, sent_2) / (||sent_1|| * ||sent_2||)``, or ``0.0`` when
        either vector has zero norm.
    """
    denominator = np.linalg.norm(sent_1) * np.linalg.norm(sent_2)
    # convert_to_sentence_vector() emits all-zero rows for empty sentences;
    # dividing by a zero norm would yield NaN (plus a RuntimeWarning) and that
    # NaN would end up as an edge weight in the similarity graph.
    if denominator == 0:
        return 0.0
    return np.dot(sent_1, sent_2) / denominator

def convert_to_sentence_vector(sentences_list, embeddings=None, dim=300):
    """Embed each sentence as the mean of its word vectors.

    Parameters
    ----------
    sentences_list : iterable of str
        Cleaned sentences; words are obtained with ``str.split()``.
    embeddings : mapping of str -> array-like, optional
        Word-vector lookup table. Defaults to the module-level
        ``word_embedding`` dictionary.
    dim : int, optional
        Dimensionality of the word vectors (default 300).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(sentences_list), dim)``. Words missing from
        ``embeddings`` contribute a zero vector; sentences without any words
        (empty or whitespace-only) become an all-zero row.
    """
    if embeddings is None:
        # Resolved at call time so the notebook's global table is used by default.
        embeddings = word_embedding

    sentences = list(sentences_list)
    # Preallocate instead of growing the array with np.append on every
    # iteration, which re-copies the whole matrix each time.
    sentence_matrix = np.zeros((len(sentences), dim))
    unknown_word = np.zeros(dim)

    for row, sentence in enumerate(sentences):
        words = sentence.split()
        # Test the token list rather than the string length: a
        # whitespace-only sentence has len() > 0 but no words, which
        # previously caused a division by zero.
        if words:
            sentence_matrix[row] = sum(embeddings.get(word, unknown_word) for word in words) / len(words)

    return sentence_matrix

def create_weigh_graph(sentence_matrix):
    """Build the pairwise cosine-similarity matrix between sentence vectors.

    Parameters
    ----------
    sentence_matrix : numpy.ndarray
        2-D array with one sentence vector per row.

    Returns
    -------
    numpy.ndarray
        Square array whose entry ``[i, j]`` is the cosine similarity of rows
        ``i`` and ``j`` rounded to three decimals; the diagonal is left at 0.
    """
    n_sentences = sentence_matrix.shape[0]
    weights = np.zeros((n_sentences, n_sentences))

    for row in range(n_sentences):
        for col in range(n_sentences):
            # Self-similarity is skipped so the diagonal stays zero.
            if row == col:
                continue
            weights[row][col] = cosine_similarity(sentence_matrix[row], sentence_matrix[col])

    return np.round(weights, 3)

In [None]:
# Location of the pre-trained GloVe vectors (file name indicates 300-d vectors).
# NOTE(review): this is an absolute, machine-specific path — consider making it
# relative to a configurable data directory.
GLOVE_PATH = 'C:/Users/erict/OneDrive/桌面/project/glove.6B/glove.6B.300d.txt'

# Maps each word to its float32 coefficient vector.
word_embedding = {}

# `with` guarantees the file is closed even if parsing a line fails.
with open(GLOVE_PATH, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        # First field is the word, the remaining fields are its coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embedding[word] = coefs

In [206]:
# Sanity check: number of words loaded into the embedding table.
len(word_embedding)

400001

In [212]:
# Load the article, split it into lower-cased sentences, then strip punctuation
# and stop words. raw_sentences_list is kept so the selected sentences can be
# printed in readable form later; sentences_list feeds the embedding step.
raw_txt = read_txt_file('reuters_news_2.txt')
raw_sentences_list = txt_to_sentence(raw_txt)
sentences_list = remove_stopwords(raw_sentences_list)

In [213]:
# One averaged word-vector row per cleaned sentence.
sentence_matrix = convert_to_sentence_vector(sentences_list)

In [214]:
# Display the sentence-vector matrix.
sentence_matrix

array([[-0.05583567,  0.16225817, -0.06844132, ..., -0.2921087 ,
         0.039524  ,  0.0015288 ],
       [ 0.04907333,  0.186096  ,  0.06339499, ..., -0.03764349,
         0.14238466, -0.26509133],
       [-0.00231583,  0.15954583, -0.00567017, ...,  0.05344542,
        -0.12651099,  0.03477116],
       ...,
       [ 0.01159267,  0.02682467, -0.07696433, ..., -0.21714467,
        -0.04227406,  0.09052   ],
       [-0.12405426,  0.219503  ,  0.04452418, ..., -0.17727154,
        -0.105226  , -0.14149964],
       [-0.023623  ,  0.18105051,  0.10365775, ..., -0.23106501,
        -0.13980675, -0.2692875 ]])

In [215]:
# Pairwise cosine similarities between sentence vectors (diagonal is 0),
# rounded to three decimals; displayed below.
weight_matrix = create_weigh_graph(sentence_matrix=sentence_matrix)
weight_matrix

array([[0.   , 0.616, 0.748, 0.853, 0.824, 0.726, 0.698, 0.78 , 0.826,
        0.637, 0.624, 0.579],
       [0.616, 0.   , 0.553, 0.626, 0.714, 0.492, 0.481, 0.599, 0.603,
        0.543, 0.486, 0.568],
       [0.748, 0.553, 0.   , 0.696, 0.7  , 0.546, 0.559, 0.638, 0.807,
        0.499, 0.513, 0.537],
       [0.853, 0.626, 0.696, 0.   , 0.834, 0.744, 0.761, 0.818, 0.848,
        0.589, 0.72 , 0.66 ],
       [0.824, 0.714, 0.7  , 0.834, 0.   , 0.783, 0.762, 0.819, 0.815,
        0.694, 0.66 , 0.69 ],
       [0.726, 0.492, 0.546, 0.744, 0.783, 0.   , 0.763, 0.748, 0.662,
        0.572, 0.61 , 0.593],
       [0.698, 0.481, 0.559, 0.761, 0.762, 0.763, 0.   , 0.798, 0.703,
        0.607, 0.708, 0.652],
       [0.78 , 0.599, 0.638, 0.818, 0.819, 0.748, 0.798, 0.   , 0.786,
        0.665, 0.707, 0.758],
       [0.826, 0.603, 0.807, 0.848, 0.815, 0.662, 0.703, 0.786, 0.   ,
        0.621, 0.656, 0.665],
       [0.637, 0.543, 0.499, 0.589, 0.694, 0.572, 0.607, 0.665, 0.621,
        0.   , 0.564

In [216]:
# Treat the similarity matrix as a weighted graph and rank its nodes with PageRank.
# scores maps a node index (used below as a sentence index) to its score.
nx_graph = nx.from_numpy_array(weight_matrix)
scores = nx.pagerank(nx_graph)

In [219]:
# Show the original article text for reference.
print(raw_txt)

A major goal of President Donald Trump has been to drive manufacturers to bring work back to the United States, presumably aided by new automation and robotics that would allow domestic plants to compete with cheaper labor in China and other lower-cost countries. But that trend appears to have been overwhelmed by a larger slowdown in manufacturing.

Alexander Shikany, vice president of the Association for Advancing Automation, said the slowdown is likely to be short lived. Orders for new robots in North America, a separate measure that gives a sense of how many machines will be installed in future months, increased last year by 1.6% to 29,988 units, Shikany noted.

The largest driver of that growth was a more than 50% jump in orders from automakers, which Shikany said were making robots part of their investment in the next wave of automotive technology.

No. 1 U.S. automaker General Motors Co (GM.N), for example, recently announced it was investing $2.2 billion to build electric trucks

In [217]:
# Indices of the five highest-scoring sentences, best first.
ranked_indices = sorted(scores, key=scores.get, reverse=True)
top_idx = ranked_indices[:5]

# Print the selected sentences in their lower-cased, uncleaned form.
for idx in top_idx:
    print(raw_sentences_list[idx], '\n')

the largest driver of that growth was a more than 50% jump in orders from automakers, which shikany said were making robots part of their investment in the next wave of automotive technology. 

orders for new robots in north america, a separate measure that gives a sense of how many machines will be installed in future months, increased last year by 1.6% to 29,988 units, shikany noted. 

with demand from e-commerce businesses and other warehouse operations booming, the company spent $1.9 million last year to help automate its assembly line. 

david peacock, the company’s president, said the company realized three years ago it would have trouble keeping up with demand growth without more robots. 

a major goal of president donald trump has been to drive manufacturers to bring work back to the united states, presumably aided by new automation and robotics that would allow domestic plants to compete with cheaper labor in china and other lower-cost countries. 



In [218]:
# Run gensim's built-in summarizer on the same lower-cased sentences
# (presumably as a comparison against the PageRank selection above).
# NOTE(review): gensim.summarization appears to have been removed in gensim 4.x —
# confirm the pinned gensim version before re-running.
summarized_content = gensim.summarization.summarize(' '.join(raw_sentences_list), ratio=0.3)
print(summarized_content)

orders for new robots in north america, a separate measure that gives a sense of how many machines will be installed in future months, increased last year by 1.6% to 29,988 units, shikany noted.
the largest driver of that growth was a more than 50% jump in orders from automakers, which shikany said were making robots part of their investment in the next wave of automotive technology.
hytrol conveyor co inc, a privately held company in jonesboro, arkansas, that produces conveyor belts and had sales last year of over $200 million, did not cut back on robot installations in 2019.
