In [1]:
# Sample strings used to exercise the vectorizer: copyright-style notices
# first, then short strings that merely mention "(c)" or "Copyright".
# NOTE(review): "Foudation" is spelled this way in the sample; kept verbatim
# because changing it would change the computed features — confirm it is intended.
corpus = [
    "Copyright (c) 1991-1995 Stichting Mathematisch Centrum Amsterdam The Netherlands",
    "Copyright (c) 1995-2001 Corporation for National Research Initiatives",
    "Copyright (c) 1995-2010 Free Software Foudation Inc",
    "Copyright (c) 2013 title from fe2.rs.github.com Github Inc.",
    "Copyright (c) 1999-2009, OW2 Consortium http://www.ow2.org/",
    "Copyright (c) 2002-2018 Pivotal, Inc.",
    "(c) September 2000",
    "(c) OoO.1 (c)",
    "U.S. Copyright Office",
    "Copyright Law",
    "Copyright Small Claims",
    "Search Copyright Records: Copyright Public Records Portal",
]

# Echo the list so the notebook displays it as the cell output.
corpus


['Copyright (c) 1991-1995 Stichting Mathematisch Centrum Amsterdam The Netherlands',
 'Copyright (c) 1995-2001 Corporation for National Research Initiatives',
 'Copyright (c) 1995-2010 Free Software Foudation Inc',
 'Copyright (c) 2013 title from fe2.rs.github.com Github Inc.',
 'Copyright (c) 1999-2009, OW2 Consortium http://www.ow2.org/',
 'Copyright (c) 2002-2018 Pivotal, Inc.',
 '(c) September 2000',
 '(c) OoO.1 (c)',
 'U.S. Copyright Office',
 'Copyright Law',
 'Copyright Small Claims',
 'Search Copyright Records: Copyright Public Records Portal']

In [2]:
from copyrightDet.match_string import MatchString

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import os

# Read the fixed vocabulary from data/vocabulary.txt (resolved against the
# current working directory); each line of the file becomes one entry.
vocabulary_path = os.path.join(os.getcwd(), "data", "vocabulary.txt")
with open(vocabulary_path, "r", encoding="utf8") as vocabulary_file:
    vocabulary = vocabulary_file.read().splitlines()

# MatchString.preprocess is passed to the vectorizer as its preprocessor.
# NOTE(review): judging by the feature names, it presumably rewrites parts of
# the text into "detected_*" placeholder tokens — confirm in match_string.
prePro = MatchString()
vectorizer_options = {
    "preprocessor": prePro.preprocess,
    "vocabulary": vocabulary,
    "ngram_range": (1, 4),  # word n-grams from 1 up to 4 tokens
}
vectorizer = TfidfVectorizer(**vectorizer_options)

# Fit on the sample corpus and produce its TF-IDF matrix in a single call.
X = vectorizer.fit_transform(corpus)

print("Feature names:")
vectorizer.get_feature_names_out()


Feature names:


array(['corporation', 'corp', 'ltd', 'inc', 'foundation', 'author',
       'group', 'all rights reserved', 'this file is', 'enterprise',
       'incorporated', 'co', 'llc', 'detected_year',
       'detected_copyright detected_year',
       'detected_copyright detected_year detected_year',
       'detected_copyright detected_copyright detected_year',
       'detected_copyright detected_copyright detected_year detected_year',
       'gmbh', 'by',
       'detected_copyright detected_copyright detected_org detected_year',
       'detected_copyright detected_org detected_year',
       'detected_copyright detected_copyright detected_person detected_year',
       'detected_copyright detected_person detected_year', 'the',
       'or its affiliates', 'original author or authors',
       'detected_other_words', 'detected_copyright detected_org',
       'detected_copyright detected_copyright detected_org',
       'detected_copyright detected_person',
       'detected_copyright detected_copyright 

In [3]:

print("DataFrame:")
# Densify the matrix and label it: one row per corpus string,
# one column per feature name reported by the vectorizer.
tfidf_dense = X.toarray()
feature_names = vectorizer.get_feature_names_out()
df = pd.DataFrame(data=tfidf_dense, index=corpus, columns=feature_names)
df

DataFrame:


Unnamed: 0,corporation,corp,ltd,inc,foundation,author,group,all rights reserved,this file is,enterprise,...,the,or its affiliates,original author or authors,detected_other_words,detected_copyright detected_org,detected_copyright detected_copyright detected_org,detected_copyright detected_person,detected_copyright detected_copyright detected_person,word_between_copyright,detected_copyright the
Copyright (c) 1991-1995 Stichting Mathematisch Centrum Amsterdam The Netherlands,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.526324,0.0,0.0,0.197943,0.0,0.0,0.0,0.0,0.0,0.0
Copyright (c) 1995-2001 Corporation for National Research Initiatives,0.526324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.197943,0.0,0.0,0.0,0.0,0.0,0.0
Copyright (c) 1995-2010 Free Software Foudation Inc,0.0,0.0,0.0,0.469398,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.205556,0.0,0.0,0.0,0.0,0.0,0.0
Copyright (c) 2013 title from fe2.rs.github.com Github Inc.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.367954,0.0,0.0,0.0,0.0,0.0,0.0
"Copyright (c) 1999-2009, OW2 Consortium http://www.ow2.org/",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.232796,0.0,0.0,0.0,0.0,0.0,0.0
"Copyright (c) 2002-2018 Pivotal, Inc.",0.0,0.0,0.0,0.469398,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.205556,0.0,0.0,0.0,0.0,0.0,0.0
(c) September 2000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.588055,0.0,0.0,0.0,0.0,0.0,0.0
(c) OoO.1 (c),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.401137,0.0,0.0,0.0,0.0,0.916018,0.0
U.S. Copyright Office,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Copyright Law,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
