In [1]:
import nltk
from nltk.corpus import stopwords
import nltk.data
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy import zeros
import os
import re


nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/zhufeng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/zhufeng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Define some useful function for the next steps

In [2]:
def keyFunc(afilename):
    #A key function used to sort the file list
    m = re.search('(?<=Episode)\d{2}', afilename)
    return m.group(0)

def untokenize(words):
    """
    Untokenizing a text undoes the tokenizing operation, restoring
    punctuation and spaces to the places that people expect them to be.
    Ideally, `untokenize(tokenize(text))` should be identical to `text`,
    except for line breaks.
    """
    #ref : https://github.com/commonsense/metanl/blob/master/metanl/token_utils.py
    text = ' '.join(words)
    step1 = text.replace("`` ", '"').replace(" ''", '"').replace('. . .', '...')
    step2 = step1.replace(" ( ", " (").replace(" ) ", ") ")
    step3 = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", step2)
    step4 = re.sub(r' ([.,:;?!%]+)$', r"\1", step3)
    step5 = step4.replace(" '", "'").replace(" n't", "n't").replace(
        "can not", "cannot")
    step6 = step5.replace(" ` ", " '")
    return step6.strip()

def get_frequent_word(tfidf, episod):
    X = tfidf.transform([episod])
    tfidf_table = X.todense().tolist()[0]
    frequent_word = []
    candidate_number = 20
    for word in sorted(zip(tfidf_table,tfidf.get_feature_names()), reverse=True)[:candidate_number]:
        frequent_word.append(word[1])
    return frequent_word

def make_sentence (untokenized):
    #ref :https://github.com/thavelick/summarize/blob/master/summarize.py
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    normal_sentences = sent_detector.tokenize(untokenized) 
    working_sentences = [sentence.lower() for sentence in normal_sentences]
    return working_sentences, normal_sentences


def get_output_sentence(most_frequent_words, working_sentences, normal_sentences, num_phrase):
    temp_sentences = []
    output_sentences = []
    for word in most_frequent_words:
        for i in range(len(working_sentences)):
            if (word in working_sentences[i] and normal_sentences[i] not in temp_sentences):
                temp_sentences.append(normal_sentences[i]) 
    counter = zeros(len(temp_sentences))
    for i in range(len(temp_sentences)):
        for word in most_frequent_words:
            if word in temp_sentences[i]:
                counter[i]+=1        
    for sentence in sorted(zip(counter,temp_sentences), reverse=True)[:num_phrase]:
        output_sentences.append(sentence[1])
    return output_sentences

def reorder_sentences(output_sentences, original):
    ordered_output=[]
    for sentence in original:
        if sentence in output_sentences:
            ordered_output.append(sentence)
    return ordered_output

### Read the files and prepare the corpus for the next step

In [3]:
dir_path = './sous_titre/'
file_list = sorted(os.listdir(dir_path), key=keyFunc)
collection=[] #document collection
for file in file_list:
    file_path = os.path.join(dir_path, file)
    if os.path.isfile(file_path):
        with open(file_path,"rt") as f:
            lines=f.readlines()
            episod=[]
            for x in lines:
                episod.append(x.split(' ')[4])
        collection.append(episod)
        
untokenized_collection=[]
        
for episod in range(len(collection)):
    untokenized_collection.append(untokenize(collection[episod]))


print (untokenized_collection[0])


So if a photon is directed through a plane with two slits in it and either slit is observed, it will not go through both slits. If it's unobserved it will, however, if it's observed after it's left the plane but before it hits its target, it will not have gone through both slits. Agreed, what's your point? There's no point, I just think it's a good idea for a tee-shirt. Excuse me? Hang on. One across is Aegean, eight down is Nabakov, twenty-six across is MCM, fourteen down is... move your finger... phylum, which makes fourteen across Port-au-Prince. See, Papa Doc's capital idea, that's Port-au-Prince. Haiti. Can I help you? Yes. Um, is this the High IQ sperm bank? If you have to ask, maybe you shouldn't be here. I think this is the place. Fill these out. Thank-you. We'll be right back. Oh, take your time. I'll just finish my crossword puzzle. Oh, wait. Leonard, I don't think I can do this. What, are you kidding? You're a semi-pro. No. We are committing genetic fraud. There's no guarant

### Calculer le tf-idf and prepare the model

In [4]:
oral_words = ['oh', 'yes', 'no', 'hi', 'okay', 'uh', 'okay', 'bye', 'sorry', 'well', 'think','know','going','yeah'] #A compléter dans les tests
stop_words = stopwords.words('english') + list(punctuation) + oral_words
tfidf = TfidfVectorizer(stop_words=stop_words)
tfidf.fit(untokenized_collection[episod] for episod in range(len(untokenized_collection)))
print ('Model prepared! You can make your summarize now!')


Model prepared! You can make your summarize now!


### for changing the episod, just modify the index of untokenized_collection

In [5]:
most_frequent_words = get_frequent_word(tfidf, untokenized_collection[0])

working_sentences, normal_sentences = make_sentence (untokenized_collection[0])

num_sentences = 15

out=get_output_sentence(most_frequent_words, working_sentences, normal_sentences,num_sentences)

print (reorder_sentences(out, normal_sentences))

["If it's unobserved it will, however, if it's observed after it's left the plane but before it hits its target, it will not have gone through both slits.", 'I think we should be good neighbors, invite her over, make her feel welcome.', 'Who hasn\'t seen this differential below " here I sit broken hearted? "', 'Well- Just sit somewhere else.', "Yeah, it's like regular boggle, but in Klingon.", "Yes, it tells us that you participate in the mass cultural delusion that the Sun's apparent position relative to arbitrarily defined constellations and the time of your birth somehow effects your personality.", "Okay, let's see, what else, oh, I'm a vegetarian, oh, except for fish, and the occasional steak, I love steak.", 'You guys are really sweet.', "I'm not going to engage in hypotheticals here, I'm just trying to be a good neighbor.", 'Hey, is there a trick to getting it to switch from tub to shower?', "It's French for good shower.", "Why can't she get her own TV.", 'Come on, we have a comb

呜啦啦啦火车笛，做projet很忙滴。。大家唱起来！俗话说的好，彩蛋最后找，要是没找到。。。。。。。。。。。。。
















说明你小子根本没仔细看劳资辛辛苦苦写的代码！