In [2]:
"""This function creates a list of all possible subway strings for each of the subway lines.
For example, for the ace line, this returns a,c,e,ac,ae,ce,&ace"""
import itertools
def generate_subway_strings():
    is_subway = []
    subway_list = ["ace","bdfm","jz","nrqw","l","s","g","fg","em","jmz","ef","mr","bc",
                   "123","456","7","25", "34"]
    for subway in subway_list:
        #the longest string in subway list is 4 characters
        for i in range(1,5):
            for j in itertools.combinations(subway, i):
                subway_combination = ''.join(j)
                if subway_combination not in is_subway:
                    is_subway.append(subway_combination)
    return is_subway

In [10]:
import nltk
nltk.download('universal_tagset')
import csv

class TokensAndTags:
    
    def __init__(self, csv_file):
        """Requires one arguement, a csv file"""
        self.csv_file = csv_file
    
    def text_from_csv(self):
        """Reads the csv file and yields tweets. Assumes 3 columns. Strips apostrophes and slashes"""
        with open (self.csv_file, "r") as source:
            csv_reader = csv.reader(source)
            for _,_,text in csv_reader:
                text = text.replace("’","").replace("'","").replace("/","")
                yield text
    
    def sentence_boundry(self):
        """Uses nltk sentence tokenizer to return a list of sentences"""
        list_sentences = []
        s = "".join(self.text_from_csv())
        sentences = nltk.sent_tokenize(s)
        for sentence in sentences:
            if sentence in list_sentences:
                pass
            else:
                list_sentences.append(sentence)
        return list_sentences
    
    def make_tokens(self):
        """Returns a list of sentences, each of which is a list of tokens. Uses TweetTokenizer"""
        tokens = [] 
        for sentence in self.sentence_boundry():
            tokenized = nltk.tokenize.TweetTokenizer().tokenize(sentence)
            for token in tokenized:
                tokens.append(token)
        return tokens
    
    def get_tokens_tags(self):
        tokens = self.make_tokens()
        final_tokens = []
        final_tags = []
        for token, tag in nltk.pos_tag(tokens, tagset = 'universal'):
            final_tokens.append(token.casefold())
            final_tags.append(tag)
        return final_tokens,final_tags


[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/emilycampbell/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [11]:
mta_text = TokensAndTags("mta_subway_tweets.csv")
tokens,tags = mta_text.get_tokens_tags()

In [16]:
from collections import Counter
import csv

class Find_Trains:
    
    def __init__(self,tokens,tags):
        """The class takes two arguments, a list of tags and a list of tokens"""
        self.tokens = tokens
        self.tags = tags
        """This is a complete list of all the possible train combinations"""
        self.is_train = self.is_train = ['a', 'c', 'e', 'ac', 'ae', 'ce', 'ace', 
                         'b', 'd', 'f', 'm', 'bd', 'bf', 'bm', 'df', 'dm', 'fm', 'bdf', 'bdm', 'bfm', 'dfm', 'bdfm', 
                         'j', 'z', 'jz', 
                         'n', 'r', 'q', 'w', 'nr', 'nq', 'nw', 'rq', 'rw', 'qw', 'nrq', 'nrw', 'nqw', 'rqw', 'nrqw', 
                         'l', 's', 'g', 
                         'fg', 'em', 'jm', 'mz', 'jmz', 'ef', 'mr', 'bc',
                         '1', '2', '3', '12', '13', '23', '123', 
                         '4', '5', '6', '45', '46', '56', '456', 
                         '7', 
                         '25', '34']
        """This is only letter trains, excepting the A/a train which is dealt with below"""
        self.is_letter_train = frozenset(['c', 'e', 'ac', 'ae', 'ce', 'ace', 
                         'b', 'd', 'f', 'm', 'bd', 'bf', 'bm', 'df', 'dm', 'fm', 'bdf', 'bdm', 'bfm', 'dfm', 'bdfm', 
                         'j', 'z', 'jz', 
                         'n', 'r', 'q', 'w', 'nr', 'nq', 'nw', 'rq', 'rw', 'qw', 'nrq', 'nrw', 'nqw', 'rqw', 'nrqw', 
                         'l', 's', 'g', 
                         'fg', 'em', 'jm', 'mz', 'jmz', 'ef', 'mr', 'bc',])
        """This is only number trains"""
        self.is_number_train = frozenset(['1', '2', '3', '12', '13', '23', '123', 
                         '4', '5', '6', '45', '46', '56', '456', 
                         '7', 
                         '25', '34'])
        """This is only the A/a train"""
        self.is_a_train = frozenset(['a'])
        """This is a list of common train nouns that are used after train characters"""
        self.train_noun_list = ["train", "trains", "subway","line","lines", "stop", "station", 
                           "service", "platform", "track"]
        """This is a list of common train adjectives that are used before and after train characters"""
        self.train_adj_list = frozenset(["express", "exp","local","uptown", "downtown", "queensbound", 
                          "queens-bound" "southbound", "northbound", "southbound", "bound", 
                          "manhattan-bound","brooklyn-bound", "bronx-bound", "island-bound",
                          "astoria-bound","sb","nb"])
        """This is a list of common time words to filter out number trains from times"""
        self.times = frozenset(["seconds","minute","minutes","min","mins","hour","hours","hr","hrs",
                               "day","days","week","weeks","month","months","year","years","yrs"])
        
    
    def find_a_trains(self):
        """This function finds A/a trains and excludes instances of the indefinite article 'a'
        The following criteria are used (in order): 1) if the letter A comes after an adjective or determiner
        except 'such' 2) if the letter A comes after another train letter, which is followed by 
        punctuation or a conjuction 3) if the token before the letter A is included in the train
        adjective list 4) if the token after the letter A is included in the train adjective list 
        and is not followed by another train word or character"""
        trains = []
        tokens = self.tokens
        tags = self.tags
        """train_words is list which combines all the train characters and all the train nouns"""
        train_words = self.is_train + self.train_noun_list
        for i,token in enumerate(tokens):
            if token in self.is_a_train: 
                if i or i<len(tokens)-1:
                    if tags[i-1] in ['ADJ','DET'] and tokens[i-1] not in ["such", "a"]:
                        trains.append(token)
                    elif tags[i+1] in ["CONJ", "."] and tokens[i+2] in self.is_letter_train:
                        trains.append(token) 
                    elif tokens[i-1] in self.train_adj_list:
                        trains.append(token)
                    elif tokens[i+1] in self.train_adj_list and tokens[i+2] not in train_words:
                        trains.append(token)
                    else:
                        pass

        return trains
    
    def find_number_trains(self):
        """This function finds number trains and excludes instances of integers.
        The following criteria are used (in order): 1) if the comes before a verb 2) if the number
        after an adjective 3) if the number comes after a train adjective 4) if the number comes
        before a train adjective 5) if the number comes before a train noun 6) if there is another 
        train character followed by punctuation or a conjuction before the letter 7) if there is
        punctuation or a conjuction followed by another train character after the letter"""
        trains = []
        tokens = self.tokens
        tags = self.tags
        for i,token in enumerate(tokens):
            if token in self.is_number_train:
                if i or i<len(tokens)-1:
                    if tags[i+1] == "VERB":
                        trains.append(token)       
                    elif tags[i-1] == "ADJ" and tokens[i+1] not in self.times:
                        trains.append(token)
                    elif tokens[i-1] in self.train_adj_list:
                        trains.append(token)
                    elif tokens[i+1] in self.train_adj_list:
                        trains.append(token)
                    elif tokens[i+1] in self.train_noun_list:
                        trains.append(token)          
                    elif tags[i-1] in ["CONJ", "."]:
                        if tokens[i-2] in self.is_number_train:
                            trains.append(token) 
                    elif tags[i+1] in ["CONJ", "."]:
                        if tokens[i+2] in self.is_number_train:
                            trains.append(token) 
                    else:
                        pass

        return trains
    
    
    def find_letter_trains(self):
        """This function finds letter trains and excludes instances of other possible meanings
        including AC = air-conditioning, BC = because, E = east, W = west or with, F = fuck, 
        and S seems to have some weird results I think due to tokenization. """
        trains = []
        tokens = self.tokens
        tags = self.tags
        for i,token in enumerate(tokens):
            if token in self.is_letter_train:
                if i or i<len(tokens)-1:
                    
                    
                    if token == "w":
                        """disambiguates W trains from with and West street names"""
                        if any(char.isdigit() for char in tokens[i+1]) == True:
                            pass
                            """disambiguates W trains from with"""
                        elif tags[i-1] in ["ADJ", "DET"]:
                            trains.append(token)
                        elif tokens[i+1] in self.train_noun_list:
                            trains.append(token)
                        elif tokens[i+1] in self.train_adj_list:
                            trains.append(token)
                        elif tokens[i-1] in self.train_adj_list:
                            trains.append(token)
                        else:
                            pass    
                    
                    
                    elif token == "e":
                        """disambiguates E and W trains from East and West street names"""
                        if any(char.isdigit() for char in tokens[i+1]) == True:
                            pass
                        else:
                            trains.append(token)
                    
                    
                    elif token == "ac":
                        """disambiguates AC trains from air-conditioning"""
                        if tokens[i+1] in ["on", "off", "is", "blasting"]:
                            pass
                        else:
                            trains.append(token)
                    
                    
                    elif token in ["bc", "f", "s","dm"]:
                        """disambiguates BC trains from "because", F trains from "fuck" and S trains 
                        from tokenization (ie the s in "it s")"""
                        if tags[i-1] in ["ADJ", "DET"] and tokens[i+1] !="ing" and tokens[i-2] !="you":
                            trains.append(token)
                        elif tokens[i+1] in self.train_noun_list:
                            trains.append(token)
                        elif tokens[i+1] in self.train_adj_list:
                            trains.append(token)
                        elif tokens[i-1] in self.train_adj_list:
                            trains.append(token)
                        else:
                            pass
                    else:
                        trains.append(token)
        return trains
    

    
    def count_trains(self):
        """This function gathers the results of the previous 3 functions into one list"""
        trains = self.find_a_trains()+self.find_letter_trains()+self.find_number_trains()
        train_line_count = Counter(trains)
        return train_line_count.most_common()

In [17]:
tokens_and_tags = Find_Trains(tokens,tags)
tokens_and_tags.count_trains()

[('2', 241),
 ('4', 203),
 ('3', 182),
 ('5', 168),
 ('6', 163),
 ('1', 150),
 ('q', 133),
 ('f', 133),
 ('a', 131),
 ('m', 129),
 ('n', 123),
 ('7', 108),
 ('d', 107),
 ('r', 107),
 ('j', 103),
 ('b', 84),
 ('c', 83),
 ('e', 75),
 ('l', 50),
 ('g', 30),
 ('ac', 22),
 ('w', 21),
 ('456', 21),
 ('mr', 11),
 ('25', 11),
 ('ef', 9),
 ('s', 6),
 ('45', 6),
 ('z', 5),
 ('dm', 4),
 ('bc', 3),
 ('nrw', 3),
 ('fg', 3),
 ('ace', 3),
 ('bdfm', 3),
 ('rw', 2),
 ('46', 2),
 ('nw', 1),
 ('nq', 1),
 ('ce', 1),
 ('jz', 1)]