Merge pull request #112 from h1alexbel/install-requires

feat(#109): prepare fields in input.py, swap text_prediction for rf
h1alexbel · May 10, 2024 · fef5602 · fef5602 · 0pdd · May 10, 2024
2 parents 36311a7 + a0fa08d
commit fef5602
Show file tree

Hide file tree

Showing 9 changed files with 703 additions and 34 deletions.
diff --git a/.gitignore b/.gitignore
@@ -18,4 +18,5 @@ pipeline/
 .factorypath
 pyvenv.cfg
 .coverage
-*.csv
+predictions.csv
+out.csv
diff --git a/src/cli.py b/src/cli.py
@@ -27,6 +27,7 @@
 
 import typer
 
+from .pre_filter import PreFilter
 from .model_map import ModelMap
 from .filter_pipe import FilterPipe
 from src import NAME, VERSION
@@ -55,14 +56,8 @@ def filter(
     """
     Filter repositories.
     """
+    PreFilter(out).prepare()
     models = ModelMap().build()
-    # @todo #18:30min Find effective way for processing readme.
-    #  For now we are not processing readme because of
-    #  <a href="https://github.com/h1alexbel/samples-filter/issues/39">this</a>.
-    #  We need to find actual way to process readme too since it can be crucial
-    #  data as model input. Let's study papers, outlined
-    #  <a href="https://github.com/yegor256/cam/issues/227#issue-2200080559">here</a>
-    #  first, rethink it and try to implement here.
     FilterPipe(repositories, out, models.get(model), typer).apply()
 
 

diff --git a/src/feed.py b/src/feed.py
@@ -30,11 +30,10 @@ class Feed:
     def __init__(self, file):
         self.file = file
 
-    # @todo #105:60min Process all fields required as inputs.
-    #  We should process all fields required as inputs: full_name, readme,
-    #  created_at, last_commit. In case of transformer we should do it in a
-    #  prompt way, like repository advanced description. Check
-    #  <a href="https://github.com/h1alexbel/samples-filter/issues/75#issuecomment-2094153280">this</a>.
+    # @todo #109:90min Feed `readme`, `last_commit`, `created_at`, and `commits`.
+    #  We should feed other important fields too. For now we can feed readme,
+    #  but the transformer model can't process it since the input tensor is too
+    #  big. Let's resolve that problem and feed readme.
     def read(self):
         with open(self.file, "r") as input:
             csv.field_size_limit(2 * 1024 * 1024 * 1024)

diff --git a/src/filter_pipe.py b/src/filter_pipe.py
@@ -21,6 +21,7 @@
 # SOFTWARE.
 import csv
 
+from .input import Input
 from .feed import Feed
 from .text_prediction import TextPrediction
 
@@ -43,7 +44,7 @@ def __init__(self, repos, output, mdl, typer):
     def apply(self):
         instance = self.model()
         self.typer.echo(f"Filtering {self.repos} with {instance.name()}...")
-        feed = Feed(self.repos).read()
+        feed = Feed(Input(self.repos).copy()).read()
         with open("predictions.csv", "w") as predictions:
             writer = csv.DictWriter(
                 predictions,

diff --git a/src/input.py b/src/input.py
@@ -39,39 +39,22 @@ def copy(self):
                 pipe,
                 fieldnames=[
                     "full_name",
-                    "default_branch",
-                    "stars",
-                    "forks",
                     "created_at",
-                    "size",
-                    "open_issues_count",
                     "description",
-                    "topics",
                     "readme"
                 ]
             )
             writer.writeheader()
             for row in reader:
                 repo = row["full_name"]
                 branch = row["default_branch"]
-                stars = row["stars"]
-                forks = row["forks"]
                 created = row["created_at"]
-                size = row["size"]
-                issues = row["open_issues_count"]
                 description = row["description"]
-                topics = row["topics"]
                 readme = Readme(repo, branch).asText()
                 out = {
                     "full_name": repo,
-                    "default_branch": branch,
-                    "stars": stars,
-                    "forks": forks,
                     "created_at": created,
-                    "size": size,
-                    "open_issues_count": issues,
                     "description": description,
-                    "topics": topics,
                     "readme": readme
                 }
                 writer.writerow(out)

diff --git a/src/text_prediction.py b/src/text_prediction.py
@@ -32,7 +32,7 @@ def __init__(self, pred, name):
 
     def as_text(self):
         if self.model == "rf":
-            if self.pred == [0]:
+            if self.pred == [1]:
                 label = "sample"
             else:
                 label = "real"